diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml index 12b8cdcb8eedf..01a8cfc4b0df6 100644 --- a/.github/workflows/cicd-main.yml +++ b/.github/workflows/cicd-main.yml @@ -871,318 +871,6 @@ jobs: pretrained_model=${OUTPUT_DIR}/HeteronymClassification/test/checkpoints/HeteronymClassification.nemo \ output_manifest=preds.json - # L2: Dialogue Classification - - # TODO: pleasefixme - # L2_Dialogue_Classification_Dialogue_Intent_and_slot_classification_using_GPT: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure-gpus-1 - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # cd examples/nlp/dialogue && \ - # python dialogue.py \ - # model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - # model.language_model.lm_checkpoint=/home/TestData/nlp/gpt2/pytorch_model.bin\ - # model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - # model.dataset.dialogues_example_dir=sgd_gen_outputs \ - # model.dataset.task_name=debug_sample \ - # trainer.max_steps=1 \ - # trainer.max_epochs=1 \ - # model.train_ds.batch_size=2 \ - # model.validation_ds.batch_size=2 \ - # model.test_ds.batch_size=2 \ - # model.nemo_path=null \ - # trainer.val_check_interval=0.0 \ - # trainer.devices=1 \ - # model.dataset.use_cache=false \ - # model.tokenizer.special_tokens={pad_token:"endoftext"} \ - # model.tokenizer.tokenizer_name=gpt2 \ - # model.tokenizer.vocab_file=/home/TestData/nlp/gpt2/vocab.json\ - # model.language_model.pretrained_model_name=/home/TestData/nlp/gpt2 \ - # trainer.accelerator=gpu \ - # exp_manager=null && \ - # rm -rf sgd_gen_outputs - - L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_gen_bert_outputs \ - model.dataset.task_name=debug_sample \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.dataset.num_tasks=6 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-cased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_outputs - - L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - model.dataset.data_dir=/home/TestData/nlp/processed_assistant \ - model.dataset.dialogues_example_dir=sgd_gen_bert_intent_classification_outputs \ - model.dataset.task=assistant \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_bert_intent_classification_outputs - - L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/drive_thru_revised \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=sgd_gen_zero_shot_intent_classification_outputs \ - model.dataset.task=zero_shot \ - model.dataset.prompt_template="This example is" \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf sgd_gen_zero_shot_intent_classification_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=megatron \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.original_nemo_checkpoint=/home/TestData/nlp/drive_thru_revised/zeroshotintent_en_bert_base_uncased.nemo \ - model.dataset.dialogues_example_dir=design_zero_shot_intent_classification_bart_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="This example is related to" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=bert-base-uncased \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_zero_shot_intent_classification_bart_outputs - - L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/design_dataset \ - model.dataset.dialogues_example_dir=design_dialogue_nearest_neighbour_classification_outputs \ - model.dataset.task=design \ - model.dataset.prompt_template="" \ - model.library=huggingface \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=sentence-transformers/all-MiniLM-L6-v2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf design_dialogue_nearest_neighbour_classification_outputs - - # L2: Dialogue Generation - L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender_s2s \ - model.dataset.task=ms_marco \ - model.library=huggingface \ - model.dataset.debug_mode=True \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender_s2s - - L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/sgd_small \ - model.dataset.dialogues_example_dir=sgd_answer_extender_s2s \ - model.dataset.task_name=debug_sample \ - model.dataset.task=sgd_generation \ - model.dataset.input_field=utterance+system_actions \ - model.dataset.output_field=system_utterance \ - model.dataset.use_cache=false \ - model.dataset.system_utterance=next_turn \ - model.dataset.debug_mode=True \ - model.dataset.prompt_template=slots_values \ - model.library=huggingface \ - trainer.max_steps=1 \ - trainer.max_epochs=1 \ - model.train_ds.batch_size=2 \ - model.validation_ds.batch_size=2 \ - model.test_ds.batch_size=2 \ - model.nemo_path=null \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.language_model.pretrained_model_name=facebook/bart-large \ - trainer.accelerator=gpu \ - exp_manager=null - AFTER_SCRIPT: | - rm -rf sgd_answer_extender_s2s - -# - name: L2: Dialogue Generation Part 2 -# when { -# anyOf { -# branch main -# changeRequest target: main -# } -# } -# failFast true -# parallel { -# - name: Dialogue: Answer Extender using DialogueGPTGenerationModel -# - run: | -# cd examples/nlp/dialogue && \ -# python dialogue.py \ -# do_training=False \ -# model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ -# model.dataset.dialogues_example_dir=answer_extender \ -# model.library=huggingface \ -# model.dataset.task=ms_marco \ -# model.dataset.debug_mode=True \ -# trainer.val_check_interval=0.0 \ -# trainer.devices=1 \ -# model.dataset.use_cache=false \ -# model.language_model.pretrained_model_name=gpt2 \ -# trainer.accelerator=gpu \ -# exp_manager=null && \ -# rm -rf answer_extender -# } -# } -# } -# } - - # L2: COPY - L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/dialogue && \ - python dialogue.py \ - do_training=False \ - model.dataset.data_dir=/home/TestData/nlp/ms-marco-qa \ - model.dataset.dialogues_example_dir=answer_extender \ - model.library=huggingface \ - model.dataset.task=ms_marco \ - model.dataset.debug_mode=True \ - trainer.val_check_interval=0.0 \ - trainer.devices=1 \ - model.dataset.use_cache=false \ - model.language_model.pretrained_model_name=gpt2 \ - trainer.accelerator=gpu \ - exp_manager=null && \ - rm -rf answer_extender - # L2: Duplex Text Normalization L2_Duplex_Text_Normalization_with_Tarred_dataset: needs: [cicd-test-container-setup] @@ -1212,216 +900,6 @@ jobs: data.test_ds.use_cache=false \ data.test_ds.data_path=/home/TestData/nlp/duplex_text_norm/small_test.tsv -# Runs out of memory on the 12G TITAN V (GPU 0 on main CI) -# TODO: add when megatron bert is supported again in NeMo -# - name: L2: MegaBERT Token Classification -# when { -# anyOf { -# branch main -# changeRequest target: main -# } -# } -# failFast true -# - run: | -# cd examples/nlp/token_classification && \ -# python token_classification_train.py \ -# model.dataset.data_dir=/home/TestData/nlp/token_classification_punctuation/ \ -# model.language_model.pretrained_model_name=megatron-bert-345m-uncased \ -# model.train_ds.batch_size=10 \ -# model.dataset.max_seq_length=50 \ -# model.dataset.use_cache=false \ -# trainer.accelerator=gpu \ -# trainer.strategy=ddp \ -# trainer.precision=16 \ -# trainer.devices=1 \ -# trainer.accelerator="gpu" \ -# +trainer.fast_dev_run=true \ -# exp_manager=null -# } -# } - - # L2: BERT Text Classification - L2_BERT_Text_Classification_with_BERT_Test: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/text_classification && \ - python text_classification_with_bert.py \ - model.dataset.num_classes=6 \ - model.train_ds.file_path=/home/TestData/nlp/retail_text_classification/train.tsv \ - model.validation_ds.file_path=/home/TestData/nlp/retail_text_classification/dev.tsv \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.train_ds.batch_size=10 \ - model.dataset.max_seq_length=50 \ - model.dataset.use_cache=false \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - +trainer.fast_dev_run=true \ - exp_manager=null - - # L2: Parallel BERT Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_BERT_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_BERT_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - # Cannot do fast_dev_run because squad needs whole dev dataset - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=bert-base-uncased \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - # L2: Parallel BART Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_BART_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_BART_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=facebook/bart-base \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - # L2: Parallel GPT2 Question-Answering SQUAD v1.1 & v2.0 - L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v1.1/train-v1.1.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.test_ds.file=/home/TestData/nlp/squad_mini/v1.1/dev-v1.1.json \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - model.test_ds.num_samples=2 \ - model.test_ds.batch_size=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=false \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null - - L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - cd examples/nlp/question_answering && \ - python question_answering.py \ - model.train_ds.file=/home/TestData/nlp/squad_mini/v2.0/train-v2.0.json \ - model.dataset.use_cache=false \ - model.dataset.check_if_answer_in_context=false \ - model.train_ds.batch_size=2 \ - model.train_ds.num_samples=2 \ - model.validation_ds.batch_size=2 \ - model.validation_ds.num_samples=2 \ - trainer.max_epochs=1 \ - trainer.max_steps=1 \ - model.validation_ds.file=/home/TestData/nlp/squad_mini/v2.0/dev-v2.0.json \ - model.language_model.pretrained_model_name=gpt2 \ - model.dataset.version_2_with_negative=true \ - trainer.precision=16 \ - trainer.devices=1 \ - trainer.accelerator="gpu" \ - exp_manager=null # L2: Intent and Slot Classification Tasks L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification: @@ -1653,241 +1131,7 @@ jobs: pretrained_model=/home/TestData/nlp/pretrained_models/Punctuation_Capitalization_with_DistilBERT_base_uncased.nemo; rm -rf "${data_dir}" - - - L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/token_classification && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tmp_data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${tmp_data_dir}"/ && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir}" \ - model.validation_ds.ds_item="${tmp_data_dir}" \ - model.test_ds.ds_item="${tmp_data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=true && \ - tmp_data_dir_2="$(mktemp -d -p "$(pwd)")" && \ - mv "${tmp_data_dir}"/* "${tmp_data_dir_2}" && \ - rm -rf "${tmp_data_dir}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${tmp_data_dir_2}" \ - model.validation_ds.ds_item="${tmp_data_dir_2}" \ - model.test_ds.ds_item="${tmp_data_dir_2}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null; - rm -rf /workspace/NeMo/examples/nlp/token_classification/nemo_experiments \ - "${tmp_data_dir_2}" \ - "${output_dir}" - - # Punctuation & Capitalization tarred dataset: - Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - data_dir="$(mktemp -d -p "$(pwd)")" && \ - cp -r /home/TestData/nlp/token_classification_punctuation/*.txt \ - /home/TestData/nlp/token_classification_punctuation/wmt_wiki_10000 \ - "${data_dir}"/ && \ - usual_data=${data_dir}/wmt_wiki_10000 && \ - output_dir="$(mktemp -d -p "$(pwd)")" && \ - tarred_data=${output_dir}/train_tarred && \ - tokens_in_batch=2000 && \ - max_seq_length=512 && \ - lm_model=distilbert-base-uncased && \ - python examples/nlp/token_classification/data/create_punctuation_capitalization_tarred_dataset.py \ - --text ${usual_data}/input.txt \ - --labels ${usual_data}/labels.txt \ - --output_dir ${tarred_data} \ - --tokens_in_batch ${tokens_in_batch} \ - --max_seq_length 512 \ - --lines_per_dataset_fragment 2000 \ - --num_batches_per_tarfile 5 \ - --tar_file_prefix punctuation_capitalization \ - --tokenizer_name ${lm_model} \ - --use_fast_tokenizer \ - --pad_label O \ - --n_jobs 3 && \ - echo "Number of tarred files in dataset:" && \ - ls ${tarred_data}/*.tar | wc -l && \ - echo "Label id files in dataset:" && \ - ls ${tarred_data}/*.csv && \ - metadata_file=${tarred_data}/metadata.punctuation_capitalization.tokens${tokens_in_batch}.max_seq_length${max_seq_length}.${lm_model}.json && \ - python examples/nlp/token_classification/punctuation_capitalization_train_evaluate.py \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.train_ds.ds_item=${tarred_data} \ - model.language_model.pretrained_model_name=${lm_model} \ - model.train_ds.use_tarred_dataset=true \ - model.train_ds.tar_metadata_file=${metadata_file} \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.accelerator="gpu" \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir=${output_dir}/output; - - rm -rf "${output_dir}" "${data_dir}" - - # Punctuation_Capitalization_Different_ways_of_passing_labels_to_model - Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/token_classification && \ - work_dir="$(mktemp -d -p "$(pwd)")" && \ - label_vocab_dir="${work_dir}/labels" && \ - mkdir -p ${label_vocab_dir} && \ - data_dir="${work_dir}/data" && \ - mkdir -p "${data_dir}" && \ - cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - output_dir="${work_dir}/output" && \ - mkdir -p "${output_dir}" && \ - punct_label_vocab="${label_vocab_dir}/punct_label_vocab.csv" && \ - capit_label_vocab="${label_vocab_dir}/capit_label_vocab.csv" && \ - printf "O\n,\n.\n?\n" > "${punct_label_vocab}" && \ - printf "O\nU\n" > "${capit_label_vocab}" && \ - python punctuation_capitalization_train_evaluate.py \ - model.train_ds.use_tarred_dataset=false \ - model.train_ds.ds_item="${data_dir}" \ - model.validation_ds.ds_item="${data_dir}" \ - model.test_ds.ds_item="${data_dir}" \ - model.language_model.pretrained_model_name=distilbert-base-uncased \ - model.common_dataset_parameters.label_vocab_dir="${label_vocab_dir}" \ - model.class_labels.punct_labels_file="$(basename "${punct_label_vocab}")" \ - model.class_labels.capit_labels_file="$(basename "${capit_label_vocab}")" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - +exp_manager.explicit_log_dir="${output_dir}" \ - +do_testing=false && \ - python punctuation_capitalization_train_evaluate.py \ - +do_training=false \ - +do_testing=true \ - ~model.train_ds \ - ~model.validation_ds \ - model.test_ds.ds_item="${data_dir}" \ - pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - +model.train_ds.use_cache=false \ - +model.validation_ds.use_cache=false \ - +model.test_ds.use_cache=false \ - trainer.devices=[0,1] \ - trainer.strategy=ddp \ - trainer.max_epochs=1 \ - exp_manager=null && \ - rm -rf "${work_dir}" - - # TODO: pleasefixme - # Punctuation_Capitalization_Using_model-common_datasets_parameters-punct-capit-_label_ids: - # needs: [cicd-test-container-setup] - # runs-on: self-hosted-azure - # container: - # image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} - # options: - # # --user 0:128 - # --device=/dev/nvidia0 - # --gpus all - # --shm-size=8g - # --env TRANSFORMERS_OFFLINE=0 - # --env HYDRA_FULL_ERROR=1 - # --volume /mnt/datadrive/TestData:/home/TestData - # steps: - # - name: Checkout repository - # uses: actions/checkout@v4 - # - run: | - # cd examples/nlp/token_classification && \ - # work_dir="$(mktemp -d -p "$(pwd)")" && \ - # output_dir="${work_dir}/output" && \ - # mkdir -p "${output_dir}" && \ - # data_dir="${work_dir}/data" && \ - # mkdir -p "${data_dir}" && \ - # cp /home/TestData/nlp/token_classification_punctuation/*.txt "${data_dir}" && \ - # conf_name=punctuation_capitalization_config_with_ids && \ - # cp conf/punctuation_capitalization_config.yaml "${work_dir}/${conf_name}.yaml" && \ - # sed -i $\'s/punct_label_ids: null/punct_label_ids: {O: 0, \\\',\\\': 1, .: 2, \\\'?\\\': 3}/\' \ - # "${work_dir}/${conf_name}.yaml" && \ - # sed -i $\'s/capit_label_ids: null/capit_label_ids: {O: 0, U: 1}/\' \ - # "${work_dir}/${conf_name}.yaml" && \ - # python punctuation_capitalization_train_evaluate.py \ - # --config-path "${work_dir}" \ - # --config-name "${conf_name}" \ - # model.train_ds.use_tarred_dataset=false \ - # model.train_ds.ds_item="${data_dir}" \ - # model.validation_ds.ds_item="${data_dir}" \ - # model.test_ds.ds_item="${data_dir}" \ - # model.language_model.pretrained_model_name=distilbert-base-uncased \ - # +model.train_ds.use_cache=false \ - # +model.validation_ds.use_cache=false \ - # +model.test_ds.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.strategy=ddp \ - # trainer.max_epochs=1 \ - # +exp_manager.explicit_log_dir="${output_dir}" \ - # +do_testing=false && \ - # python punctuation_capitalization_train_evaluate.py \ - # +do_training=false \ - # +do_testing=true \ - # ~model.train_ds \ - # ~model.validation_ds \ - # model.test_ds.ds_item="${data_dir}" \ - # pretrained_model="${output_dir}/checkpoints/Punctuation_and_Capitalization.nemo" \ - # +model.train_ds.use_cache=false \ - # +model.validation_ds.use_cache=false \ - # +model.test_ds.use_cache=false \ - # trainer.devices=[0,1] \ - # trainer.strategy=ddp \ - # trainer.max_epochs=1 \ - # exp_manager=null && \ - # rm -rf "${work_dir}" - - # Punctuation & Capitalization inference - Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - output_dir="$(mktemp -d -p "$(pwd)")" && \ - python examples/nlp/token_classification/punctuate_capitalize_infer.py \ - --input_manifest /home/TestData/nlp/token_classification_punctuation/iwslt_tst2019.manifest \ - --output_text "${output_dir}/iwslt_inference_result.txt" \ - --max_seq_length 92 \ - --step 8 \ - --margin 16 \ - --pretrained_name punctuation_en_bert \ - --batch_size 32; - rm -rf "${output_dir}" # L2: Parallel Pretraining BERT pretraining from Text/Preprocessed L2_Pretraining_BERT_pretraining_from_Text: @@ -1947,23 +1191,6 @@ jobs: #rm -rf examples/nlp/language_modeling/PretrainingBERTFromPreprocessed - # L2: Entity Linking - L2_Entity_Linking_Self_Alignment_Pretraining_BERT: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - cd examples/nlp/entity_linking && \ - python self_alignment_pretraining.py \ - project_dir=. \ - trainer.val_check_interval=3 \ - model.raw_data=None \ - model.train_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_train_pairs.tsv \ - model.validation_ds.data_file=/home/TestData/nlp/entity_linking/tiny_example_validation_pairs.tsv \ - model.train_ds.batch_size=8 \ - model.validation_ds.batch_size=8 \ - exp_manager.exp_dir=null # TODO: remove +model.optim.capturable=True when Pytorch fix: https://github.com/pytorch/pytorch/pull/81858 # is in the release container @@ -2581,211 +1808,250 @@ jobs: L2_Megatron_Bert_Pretraining_and_Resume_Training_with_Pipeline_Parallelism: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.pipeline_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings L2_Megatron_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FUSED_ATTN=0 NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.precision=bf16 \ + model.megatron_amp_O2=True \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_Core_Bert_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=10 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.sequence_parallel=True \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - - NVTE_FLASH_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=10 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=20 \ - trainer.precision=32 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.mcore_bert=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method='block' \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/bert_pretrain_results - rm -rf examples/nlp/language_modeling/bert_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=10 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.sequence_parallel=True \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + NVTE_FLASH_ATTN=0 NVTE_FUSED_ATTN=0 python examples/nlp/language_modeling/megatron_bert_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=10 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=20 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/bert_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.mcore_bert=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_bert/data/bert/vocab.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method='block' \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence,.5,/home/TestData/nlp/megatron_bert/data/bert/simple_wiki_bert_preproc_text_sentence] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/bert_index_mappings + + rm -rf examples/nlp/language_modeling/bert_pretrain_results + rm -rf examples/nlp/language_modeling/bert_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_RETRO_Pretraining_and_Resume_Training: needs: [cicd-test-container-setup] @@ -3086,168 +2352,189 @@ jobs: L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=6 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - exp_manager.resume_if_exists=True \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=2 \ - model.optim.sched.constant_steps=2 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings - - L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=rope \ - model.rotary_percentage=0.5 \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - # trainer.devices=2 \ - # trainer.accelerator=gpu \ - # trainer.log_every_n_steps=1 \ - # trainer.val_check_interval=2 \ - # trainer.limit_val_batches=1 \ - # trainer.accumulate_grad_batches=1 \ - # trainer.max_steps=6 \ - # trainer.precision=16 \ - # trainer.gradient_clip_val=1.0 \ - # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - # exp_manager.resume_if_exists=True \ - # model.tensor_model_parallel_size=2 \ - # model.optim.name=fused_adam \ - # model.optim.lr=2e-4 \ - # model.optim.sched.warmup_steps=2 \ - # model.optim.sched.constant_steps=2 \ - # model.optim.sched.min_lr=8e-5 \ - # model.max_position_embeddings=128 \ - # model.encoder_seq_length=128 \ - # model.data.seq_length=128 \ - # model.position_embedding_type=rope \ - # model.rotary_percentage=0.5 \ - # model.normalization=rmsnorm \ - # model.bias=False \ - # model.bias_activation_fusion=False \ - # model.bias_dropout_add_fusion=False \ - # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - # model.num_layers=8 \ - # model.hidden_size=256 \ - # model.num_attention_heads=8 \ - # model.activations_checkpoint_method=block \ - # model.activations_checkpoint_granularity=full \ - # model.activations_checkpoint_num_layers=1 \ - # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=6 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + exp_manager.resume_if_exists=True \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=2 \ + model.optim.sched.constant_steps=2 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + + L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2: + needs: [cicd-test-container-setup] + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=rope \ + model.rotary_percentage=0.5 \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + # python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + # trainer.devices=2 \ + # trainer.accelerator=gpu \ + # trainer.log_every_n_steps=1 \ + # trainer.val_check_interval=2 \ + # trainer.limit_val_batches=1 \ + # trainer.accumulate_grad_batches=1 \ + # trainer.max_steps=6 \ + # trainer.gradient_clip_val=1.0 \ + # exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + # exp_manager.resume_if_exists=True \ + # model.tensor_model_parallel_size=2 \ + # model.optim.name=fused_adam \ + # model.optim.lr=2e-4 \ + # model.optim.sched.warmup_steps=2 \ + # model.optim.sched.constant_steps=2 \ + # model.optim.sched.min_lr=8e-5 \ + # model.max_position_embeddings=128 \ + # model.encoder_seq_length=128 \ + # model.data.seq_length=128 \ + # model.position_embedding_type=rope \ + # model.rotary_percentage=0.5 \ + # model.normalization=rmsnorm \ + # model.bias=False \ + # model.bias_activation_fusion=False \ + # model.bias_dropout_add_fusion=False \ + # model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + # model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + # model.num_layers=8 \ + # model.hidden_size=256 \ + # model.num_attention_heads=8 \ + # model.activations_checkpoint_method=block \ + # model.activations_checkpoint_granularity=full \ + # model.activations_checkpoint_num_layers=1 \ + # model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + # model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" # This test requires Ampere but some of the test GPUs are Volta # Need to add a check for compute capability before uncommenting this test @@ -3343,169 +2630,192 @@ jobs: L2_Megatron_GPT_with_ALiBi_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=alibi \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # not testing resume functionality to save time on ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=alibi \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=alibi \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # not testing resume functionality to save time on ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=alibi \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_with_KERPLE_Pretraining_and_Resume_Training_TP2: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure - SCRIPT: | - python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - trainer.devices=2 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=2 \ - trainer.limit_val_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=3 \ - trainer.precision=16 \ - trainer.gradient_clip_val=1.0 \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - model.tensor_model_parallel_size=2 \ - model.optim.name=fused_adam \ - model.optim.lr=2e-4 \ - model.optim.sched.warmup_steps=1 \ - model.optim.sched.constant_steps=1 \ - model.optim.sched.min_lr=8e-5 \ - model.max_position_embeddings=128 \ - model.encoder_seq_length=128 \ - model.data.seq_length=128 \ - model.position_embedding_type=kerple \ - model.normalization=rmsnorm \ - model.bias=False \ - model.bias_activation_fusion=False \ - model.bias_dropout_add_fusion=False \ - model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - model.num_layers=8 \ - model.hidden_size=256 \ - model.num_attention_heads=8 \ - model.activations_checkpoint_method=block \ - model.activations_checkpoint_granularity=full \ - model.activations_checkpoint_num_layers=1 \ - model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings - - # commented out to save time on github ci @adithyare - #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ - #trainer.devices=2 \ - #trainer.accelerator=gpu \ - #trainer.log_every_n_steps=1 \ - #trainer.val_check_interval=2 \ - #trainer.limit_val_batches=1 \ - #trainer.accumulate_grad_batches=1 \ - #trainer.max_steps=6 \ - #trainer.precision=16 \ - #trainer.gradient_clip_val=1.0 \ - #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ - #exp_manager.resume_if_exists=True \ - #model.tensor_model_parallel_size=2 \ - #model.optim.name=fused_adam \ - #model.optim.lr=2e-4 \ - #model.optim.sched.warmup_steps=2 \ - #model.optim.sched.constant_steps=2 \ - #model.optim.sched.min_lr=8e-5 \ - #model.max_position_embeddings=128 \ - #model.encoder_seq_length=128 \ - #model.data.seq_length=128 \ - #model.position_embedding_type=kerple \ - #model.normalization=rmsnorm \ - #model.bias=False \ - #model.bias_activation_fusion=False \ - #model.bias_dropout_add_fusion=False \ - #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ - #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ - #model.num_layers=8 \ - #model.hidden_size=256 \ - #model.num_attention_heads=8 \ - #model.activations_checkpoint_method=block \ - #model.activations_checkpoint_granularity=full \ - #model.activations_checkpoint_num_layers=1 \ - #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ - #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_pretrain_results - rm -rf examples/nlp/language_modeling/gpt_index_mappings + runs-on: self-hosted-azure + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + trainer.devices=2 \ + trainer.accelerator=gpu \ + trainer.log_every_n_steps=1 \ + trainer.val_check_interval=2 \ + trainer.limit_val_batches=2 \ + trainer.accumulate_grad_batches=1 \ + trainer.max_steps=3 \ + trainer.gradient_clip_val=1.0 \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + model.tensor_model_parallel_size=2 \ + model.optim.name=fused_adam \ + model.optim.lr=2e-4 \ + model.optim.sched.warmup_steps=1 \ + model.optim.sched.constant_steps=1 \ + model.optim.sched.min_lr=8e-5 \ + model.max_position_embeddings=128 \ + model.encoder_seq_length=128 \ + model.data.seq_length=128 \ + model.position_embedding_type=kerple \ + model.bias=False \ + model.bias_activation_fusion=False \ + model.bias_dropout_add_fusion=False \ + model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + model.num_layers=8 \ + model.hidden_size=256 \ + model.num_attention_heads=8 \ + model.activations_checkpoint_method=block \ + model.activations_checkpoint_granularity=full \ + model.activations_checkpoint_num_layers=1 \ + model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings + + # commented out to save time on github ci @adithyare + #python examples/nlp/language_modeling/megatron_gpt_pretraining.py \ + #trainer.devices=2 \ + #trainer.accelerator=gpu \ + #trainer.log_every_n_steps=1 \ + #trainer.val_check_interval=2 \ + #trainer.limit_val_batches=1 \ + #trainer.accumulate_grad_batches=1 \ + #trainer.max_steps=6 \ + #trainer.precision=16 \ + #trainer.gradient_clip_val=1.0 \ + #exp_manager.exp_dir=examples/nlp/language_modeling/gpt_pretrain_results \ + #exp_manager.resume_if_exists=True \ + #model.tensor_model_parallel_size=2 \ + #model.optim.name=fused_adam \ + #model.optim.lr=2e-4 \ + #model.optim.sched.warmup_steps=2 \ + #model.optim.sched.constant_steps=2 \ + #model.optim.sched.min_lr=8e-5 \ + #model.max_position_embeddings=128 \ + #model.encoder_seq_length=128 \ + #model.data.seq_length=128 \ + #model.position_embedding_type=kerple \ + #model.normalization=rmsnorm \ + #model.bias=False \ + #model.bias_activation_fusion=False \ + #model.bias_dropout_add_fusion=False \ + #model.tokenizer.vocab_file=/home/TestData/nlp/megatron_gpt/data/gpt/vocab.json \ + #model.tokenizer.merge_file=/home/TestData/nlp/megatron_gpt/data/gpt/merges.txt \ + #model.num_layers=8 \ + #model.hidden_size=256 \ + #model.num_attention_heads=8 \ + #model.activations_checkpoint_method=block \ + #model.activations_checkpoint_granularity=full \ + #model.activations_checkpoint_num_layers=1 \ + #model.data.data_prefix=[.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document,.5,/home/TestData/nlp/megatron_gpt/data/gpt/simple_wiki_gpt_preproc_text_document] \ + #model.data.index_mapping_dir=examples/nlp/language_modeling/gpt_index_mappings" + + rm -rf examples/nlp/language_modeling/gpt_pretrain_results + rm -rf examples/nlp/language_modeling/gpt_index_mappings + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" L2_Megatron_GPT_Pretraining_and_Resume_Training_PP2: needs: [cicd-test-container-setup] @@ -3663,36 +2973,50 @@ jobs: L2_Megatron_GPT_Finetuning_StarCoder_PP1: needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ - trainer.devices=1 \ - trainer.num_nodes=1 \ - trainer.precision=32 \ - trainer.max_steps=4 \ - trainer.val_check_interval=4 \ - trainer.enable_checkpointing=False \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - exp_manager.checkpoint_callback_params.save_best_model=False \ - exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ - model.peft.peft_scheme=none \ - model.optim.name=distributed_fused_adam \ - model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ - model.tensor_model_parallel_size=1 \ - model.pipeline_model_parallel_size=1 \ - model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.train_ds.num_workers=0 \ - model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.validation_ds.num_workers=0 \ - model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ - model.data.test_ds.num_workers=0 \ - model.data.train_ds.concat_sampling_probabilities=[1.0] - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/gpt_sft_results - + runs-on: self-hosted-azure-gpus-1 + timeout-minutes: 10 + container: + image: nemoci.azurecr.io/nemo_container_${{ github.run_id }} + options: + # --user 0:128 + --device=/dev/nvidia0 + --gpus all + --shm-size=8g + --env TRANSFORMERS_OFFLINE=0 + --env HYDRA_FULL_ERROR=1 + --volume /mnt/datadrive/TestData:/home/TestData + steps: + - name: Checkout repository + uses: actions/checkout@v4 + - run: | + python examples/nlp/language_modeling/tuning/megatron_gpt_finetuning.py \ + trainer.devices=1 \ + trainer.num_nodes=1 \ + trainer.precision=bf16 \ + trainer.max_steps=4 \ + trainer.val_check_interval=4 \ + trainer.enable_checkpointing=False \ + +trainer.limit_val_batches=2 \ + +trainer.limit_test_batches=2 \ + exp_manager.checkpoint_callback_params.save_best_model=False \ + exp_manager.exp_dir=examples/nlp/language_modeling/gpt_sft_results \ + model.peft.peft_scheme=none \ + model.optim.name=distributed_fused_adam \ + model.restore_from_path=/home/TestData/nlp/megatron_gpt/starcoder-ci-nemo/megatron_starcoder_tp1_pp1.nemo \ + model.tensor_model_parallel_size=1 \ + model.pipeline_model_parallel_size=1 \ + model.data.train_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.train_ds.num_workers=0 \ + model.data.test_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.validation_ds.num_workers=0 \ + model.data.validation_ds.file_names=[/home/TestData/nlp/megatron_sft/quarel.jsonl] \ + model.data.test_ds.num_workers=0 \ + model.data.train_ds.concat_sampling_probabilities=[1.0] + + rm -rf examples/nlp/language_modeling/gpt_sft_results + - uses: "NVIDIA/NeMo/.github/actions/cancel-workflow@main" + if: "failure()" + L2_Megatron_GPT_Embedding: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4545,75 +3869,7 @@ jobs: AFTER_SCRIPT: | rm -rf examples/nlp/language_modeling/bart_pretrain_results - # L2: Megatron T5 GLUE/XNLI Finetuning - # TODO(Oktai15): update it in 1.8.0 version - L2_Megatron_T5_GLUE_RTE: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_glue_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=rte \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/dev_ci.tsv - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_glue_results - - L2_Megatron_T5_GLUE_XNLI: - needs: [cicd-test-container-setup] - uses: ./.github/workflows/_test_template.yml - with: - RUNNER: self-hosted-azure-gpus-1 - SCRIPT: | - python examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py \ - -cn megatron_t5_config_finetune_glue_xnli \ - trainer.devices=1 \ - trainer.accelerator=gpu \ - trainer.log_every_n_steps=1 \ - trainer.val_check_interval=1 \ - +trainer.limit_val_batches=2 \ - +trainer.limit_test_batches=2 \ - trainer.accumulate_grad_batches=1 \ - trainer.max_steps=2 \ - trainer.precision=16 \ - exp_manager.exp_dir=examples/nlp/language_modeling/t5_xnli_results \ - model.restore_from_path=/home/TestData/nlp/megatron_t5/8m/megatron_t5_8m-refactor.nemo \ - model.pipeline_model_parallel_size=1 \ - model.pipeline_model_parallel_split_rank=0 \ - model.data.train_ds.global_batch_size=4 \ - model.data.train_ds.micro_batch_size=2 \ - model.data.validation_ds.global_batch_size=2 \ - model.data.validation_ds.micro_batch_size=2 \ - model.data.test_ds.global_batch_size=2 \ - model.data.test_ds.micro_batch_size=2 \ - model.data.train_ds.task_name=rte \ - model.data.train_ds.file_path=/home/TestData/nlp/megatron_t5/data/train_ci.tsv \ - model.data.validation_ds.task_name=xnli \ - model.data.validation_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv \ - model.data.test_ds.task_name=xnli \ - model.data.test_ds.file_path=/home/TestData/nlp/megatron_t5/data/xnli_dev_ci.tsv - AFTER_SCRIPT: | - rm -rf examples/nlp/language_modeling/t5_xnli_results - + L2_Megatron_T5_PEFT_Lora_TP2: needs: [cicd-test-container-setup] uses: ./.github/workflows/_test_template.yml @@ -4941,23 +4197,7 @@ jobs: - L2_Segmentation_Tool_Parallel_ctc_segmentation_test_L2_Ru_QN_with_mp3 - L2_G2P_Models_G2P_Conformer_training_evaluation_and_inference - L2_G2P_Models_HeteronymClassificationModel_training_evaluation_and_inference - - L2_Dialogue_Classification_Intent_and_slot_classification_using_SGDQA - - L2_Dialogue_Classification_Intent_and_slot_classification_using_IntentSlotClassificationModel - - L2_Dialogue_Classification_Intent_classification_using_ZeroShotIntentModel - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel - - L2_Dialogue_Classification_Design_Intent_classification_using_ZeroShotIntentModel_BART_Classifier - - L2_Dialogue_Classification_Design_Intent_classification_using_DialogueNearestNeighbourModel - - L2_Dialogue_Generation_Dialogue_Answer_Extender_using_DialogueS2SGenerationModel - - L2_Dialogue_Generation_Dialogue_SGD_Based_Answer_Extender_using_DialogueS2SGenerationModel - - L2_COPY_Dialogue_Answer_Extender_using_DialogueGPTGenerationModel - L2_Duplex_Text_Normalization_with_Tarred_dataset - - L2_BERT_Text_Classification_with_BERT_Test - - L2_Parallel_BERT_Question-Answering_SQUAD_v1_1 - - L2_Parallel_BERT_Question-Answering_SQUAD_v2_0 - - L2_Parallel_BART_Question-Answering_SQUAD_v1_1 - - L2_Parallel_BART_Question-Answering_SQUAD_v2_0 - - L2_Parallel_GPT2_Question-Answering_SQUAD_v1_1 - - L2_Parallel_GPT2_Question-Answering_SQUAD_v2_0 - L2_Intent_and_Slot_Classification_Tasks_Intent_and_Slot_Classification - L2_Intent_and_Slot_Classification_Tasks_Multi-Label_Intent_and_Slot_Classification - L2_Parallel_NLP_Examples2_NER_finetuning_from_pretrained_Test @@ -4965,13 +4205,8 @@ jobs: - L2_Parallel_NLP_Examples2_NER_with_TurkuNLP__bert-base-finnish-cased-v1 - L2_Parallel_NLP_Examples2_Evaluation_script_for_Token_Classification - L2_Parallel_NLP_Examples2_Evaluation_script_for_Punctuation - - L2_Parallel_NLP_Examples2_Punctuation_Capitalization_2GPUs_with_DistilBERT_Finetuning_on_other_data - - Punctuation_Capitalization_tarred_dataset_create_and_use_tarred_dataset - - Punctuation_Capitalization_Using_model-common_datasets_parameters-label_vocab_dir - - Punctuation_Capitalization_inference_Restore_punctuation_and_capitalization_in_long_text - L2_Pretraining_BERT_pretraining_from_Text - L2_Pretraining_BERT_from_Preprocessed - - L2_Entity_Linking_Self_Alignment_Pretraining_BERT - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Post-LN - L2_NMT_Attention_is_All_You_Need_Training_NMT_Training_Pre-LN - L2_NMT_Attention_is_All_You_Need_Training_NMT_Multi-Validation @@ -5013,8 +4248,6 @@ jobs: - L2_Megatron_T5_Eval - L2_Megatron_BART_Pretraining_and_Resume_Training_TP2 - L2_Megatron_BART_Pretraining_and_Resume_Training_PP2 - - L2_Megatron_T5_GLUE_RTE - - L2_Megatron_T5_GLUE_XNLI - L2_Megatron_T5_PEFT_Lora_TP2 - L2_Megatron_Mock_Data_Generation_MockGPTDataset - L2_Megatron_Mock_Data_Generation_MockT5Dataset diff --git a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml index bc66ae717ebb3..4eef38e715d45 100644 --- a/examples/nlp/language_modeling/conf/megatron_bert_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_bert_config.yaml @@ -5,7 +5,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -41,7 +41,7 @@ exp_manager: model: # model parallelism - mcore_bert: False + mcore_bert: True micro_batch_size: 4 global_batch_size: 8 tensor_model_parallel_size: 1 @@ -85,7 +85,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 grad_div_ar_fusion: False @@ -158,4 +158,4 @@ model: name: CosineAnnealing warmup_steps: 500 constant_steps: 50000 - min_lr: 2e-5 \ No newline at end of file + min_lr: 2e-5 diff --git a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml index ca0c3f74e4c83..1f63f7742ea06 100755 --- a/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml +++ b/examples/nlp/language_modeling/conf/megatron_gpt_config.yaml @@ -9,7 +9,7 @@ trainer: devices: 1 num_nodes: 1 accelerator: gpu - precision: 16 + precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False use_distributed_sampler: False @@ -56,7 +56,7 @@ exp_manager: model: # use GPTModel from megatron.core - mcore_gpt: False + mcore_gpt: True # specify micro_batch_size, global_batch_size, and model parallelism # gradient accumulation will be done automatically based on data_parallel_size @@ -121,7 +121,7 @@ model: fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 # Megatron O2-style half-precision - megatron_amp_O2: False # Enable O2-level automatic mixed precision using main parameters + megatron_amp_O2: True # Enable O2-level automatic mixed precision using main parameters grad_allreduce_chunk_size_mb: 125 # Fusion diff --git a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py index 98d24802189e6..92c56a4c20df6 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/assistant_data_processor.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueAssistantDataProcessor'] @@ -31,6 +32,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg): data_dir: path to data directory tokenizer: tokenizer object """ + # deprecation warning + deprecated_warning("DialogueAssistantDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -69,16 +73,15 @@ def open_file(self, filename): @staticmethod def get_continuous_slots(slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids): - """ Extract continuous spans of slot_ids - To accomodate slots with distinct labels for B-label1 and I-label1, + To accomodate slots with distinct labels for B-label1 and I-label1, slot_id = self.bio_slot_ids_to_unified_slot_ids[slot_id] is called to map them both to label1 - + Args: Slot: list of int representing slot of each word token - For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 + For instance, 54 54 54 54 54 54 54 54 18 54 44 44 54 46 46 54 12 Corresponds to "please set an alarm clock for my next meeting with the team at three pm next friday" Except for the empty_slot_id (54 in this case), we hope to extract the continuous spans of tokens, each containing a start position and an exclusive end position @@ -124,7 +127,7 @@ def map_bio_format_slots_to_unified_slots(slots): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the assistant dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. @@ -177,7 +180,11 @@ def get_dialog_examples(self, dataset_split: str): "labels": {"service": intent.split('_')[0], "intent": intent, "slots": slot_to_words}, "label_positions": { "slots": { - slot: {"start": position[0], "exclusive_end": position[1], "slot": slot,} + slot: { + "start": position[0], + "exclusive_end": position[1], + "slot": slot, + } for slot, position in slot_to_start_and_exclusive_end.items() } }, diff --git a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py index 2a4b21c705353..c41c1f5e04ca2 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/data_processor.py @@ -17,6 +17,7 @@ import random from nemo.collections.nlp.data.data_utils.data_preprocessing import DataProcessor +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDataProcessor'] @@ -40,6 +41,9 @@ class DialogueDataProcessor(DataProcessor): """ def __init__(self): + # deprecation warning + deprecated_warning("DialogueDataProcessor") + raise NotImplementedError() def get_train_examples(self): @@ -58,8 +62,8 @@ def get_test_examples(self): def get_relevant_idxs(dataset_split, n_samples, dev_proportion): """ Obtain indexes for each dataset_split, when train and dev sets are not in separate files - - Args: + + Args: dataset_split: train, dev or test n_samples: total number of samples dev_proportion: value from 1 to 99 that represent proportion of data in dev set diff --git a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py index 5e58919b76522..56e99c4bcfe91 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/design_data_processor.py @@ -19,6 +19,7 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueDesignDataProcessor'] @@ -34,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueDesignDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -50,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} Dev set contains self.cfg.dev_proportion % of samples with the rest going into the train set Test set contains the whole dataset (Dev + Train) as this dataset is small (~100) and primarily used in a zero shot setting diff --git a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py index 58814a8eee90d..67d58ff5d21ec 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/mellon_qa_data_processor.py @@ -19,13 +19,13 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMellonQADataProcessor'] class DialogueMellonQADataProcessor(DialogueDataProcessor): - """Data Processor for Mellon QA dialogues. - """ + """Data Processor for Mellon QA dialogues.""" def __init__(self, data_dir: str, tokenizer: object, cfg=None): """ @@ -35,6 +35,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMellonQADataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -51,7 +54,7 @@ def open_csv(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the Mellon QA dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. @@ -82,7 +85,11 @@ def get_dialog_examples(self, dataset_split: str): input_example = { "utterance": utterance, "example_id": i, - "labels": {"response": answer, "fluent_response": well_formed_answer, "passage": passage,}, + "labels": { + "response": answer, + "fluent_response": well_formed_answer, + "passage": passage, + }, } example = DialogueInputExample(input_example) examples.append(example) diff --git a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py index 78f434c1d5dda..d09960a35d690 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/ms_marco_data_processor.py @@ -19,15 +19,16 @@ from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueMSMarcoDataProcessor'] class DialogueMSMarcoDataProcessor(DialogueDataProcessor): """Data Processor for MS Marco dialogues. (https://github.com/microsoft/MSMARCO-Question-Answering) - Please agree to the Terms of Use before downloading data at - https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz - https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz + Please agree to the Terms of Use before downloading data at + https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz + https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz """ def __init__(self, data_dir: str, tokenizer: object, cfg=None): @@ -39,6 +40,9 @@ def __init__(self, data_dir: str, tokenizer: object, cfg=None): debug_mode: reduce number of samples to load in order to increase speed of processing cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueMSMarcoDataProcessor") + self.data_dir = data_dir self._tokenizer = tokenizer self.cfg = cfg @@ -55,7 +59,7 @@ def open_json(self, filename): def get_dialog_examples(self, dataset_split: str): """ Process raw files into DialogueInputExample - Args: + Args: dataset_split: {train, dev, test} For the MS Marco dataset, there is no explicit dev set (instead uses the test set as the dev set) Therefore, this function creates a dev set and a new train set from the train set. diff --git a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py index a78e1973e55fa..1d37c26f1c45e 100644 --- a/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py +++ b/nemo/collections/nlp/data/dialogue/data_processor/sgd_data_processor.py @@ -28,6 +28,7 @@ from nemo.collections.nlp.data.dialogue.input_example.input_example import DialogueInputExample from nemo.collections.nlp.data.dialogue.sgd.schema import Schema from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning from nemo.utils.get_rank import is_global_rank_zero __all__ = ['DialogueSGDDataProcessor'] @@ -51,7 +52,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): # git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git ***Data format*** - SGD data comes with a JSON schema file and dialogue files for each dataset split. + SGD data comes with a JSON schema file and dialogue files for each dataset split. In the following we will show an example for a service entry in the schema file. * service_name @@ -70,7 +71,7 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * result_slots (not used) - In the following we will show an example for a dialogue. + In the following we will show an example for a dialogue. * dialogue_id * services * turns @@ -87,14 +88,18 @@ class DialogueSGDDataProcessor(DialogueDataProcessor): * state * active_intent * requeste_slots - * slot_values + * slot_values * speaker - [USER, SYSTEM] * utterance """ def __init__( - self, data_dir: str, dialogues_example_dir: str, tokenizer: object, cfg=None, + self, + data_dir: str, + dialogues_example_dir: str, + tokenizer: object, + cfg=None, ): """ Constructs DialogueSGDDataProcessor @@ -104,6 +109,9 @@ def __init__( tokenizer: tokenizer object cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueSGDDataProcessor") + self.data_dir = data_dir self.cfg = cfg @@ -213,7 +221,7 @@ def get_labels(self): def get_dialog_examples(self, dataset_split: str) -> List[object]: """ - Loads preprocessed dialogue examples from disk. + Loads preprocessed dialogue examples from disk. Args: dataset_split: dataset split Returns: @@ -260,7 +268,7 @@ def _generate_dialog_examples(self, dataset_split: str, schemas: object, subsamp Returns a list of `InputExample`s of the data splits' dialogues. Args: dataset_split: data split, can be "train", "dev", or "test". - schemas: schema for all services of all datasets + schemas: schema for all services of all datasets subsample: whether to balance postive and negative samples in the dataset Returns: examples: a list of `InputExample`s. @@ -447,9 +455,9 @@ def _create_examples_from_turn( "example_id_num": example_id_num, "utterance": user_utterance, "system_utterance": system_utterance, - "system_slots": {slot["slot"]: slot for slot in system_frame["slots"]} - if system_frame is not None - else None, + "system_slots": ( + {slot["slot"]: slot for slot in system_frame["slots"]} if system_frame is not None else None + ), "system_actions": system_frame["actions"] if system_frame is not None else None, "labels": { "service": service, @@ -464,9 +472,11 @@ def _create_examples_from_turn( for intent in schemas.get_service_schema(service).intents ], "slots": { - slot: schemas.get_service_schema(service).get_categorical_slot_values(slot) - if slot in categorical_slots - else [] + slot: ( + schemas.get_service_schema(service).get_categorical_slot_values(slot) + if slot in categorical_slots + else [] + ) for slot in all_possible_slots }, }, diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py index 0931fe383f943..33d46c308e810 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_bert_dataset.py @@ -21,12 +21,12 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.core.neural_types import ChannelType, LabelsType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueBERTDataset', 'DialogueIntentSlotInferenceDataset'] class DialogueBERTDataset(DialogueDataset): - """ Creates a dataset to use for the task of joint intent and slot classification with pretrained model. @@ -37,8 +37,7 @@ class DialogueBERTDataset(DialogueDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -57,6 +56,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: config container for dataset """ + # deprecation warning + deprecated_warning("DialogueBERTDataset") + self.cfg = cfg self.all_possible_labels = dialogues_processor.intents self.label_to_label_id = {self.all_possible_labels[i]: i for i in range(len(self.all_possible_labels))} @@ -183,7 +185,7 @@ def get_features( ignore_start_end=False, ): """ - Convert queries (utterance, intent label and slot labels) to BERT input format + Convert queries (utterance, intent label and slot labels) to BERT input format """ all_subtokens = [] @@ -297,7 +299,7 @@ class DialogueIntentSlotInferenceDataset(DialogueBERTDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: """ - Returns definitions of module output ports. + Returns definitions of module output ports. """ return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), @@ -308,6 +310,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: } def __init__(self, queries, max_seq_length, tokenizer, do_lower_case): + # deprecation warning + deprecated_warning("DialogueIntentSlotInferenceDataset") + if do_lower_case: queries = [query.lower() for query in queries] diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py index 1ac04a856a89c..f89a5013c2ae6 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_classification_dataset.py @@ -21,27 +21,31 @@ from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class DialogueGPTClassificationDataset(DialogueDataset): ''' Designed for classification tasks such as intent/domain classification as well as slot tagging - Dataset Class + Dataset Class 1. Performs Model-dependent (but Data-independent) operations (tokenization etc) 2. This can allow the same model preprocessing for multiple datasources - 3. Users can configurate which labels to use for modelling + 3. Users can configurate which labels to use for modelling (e.g. intent classification, slot filling or both together etc) ''' def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor + """Constructor Args: dataset_split: dataset split dialogues_processor: Data generator for SGD dialogues tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueGPTClassificationDataset") + self.cfg = cfg if self.cfg.target_template == "with_slots" and self.cfg.eval_mode != "generation": @@ -229,19 +233,18 @@ def collate_fn(self, batch): return all_items def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. service: restaurant e.g. service: restaurant e.g. \nintent: set alarm\nslots: (), () Generation example: - e.g. service: + e.g. service: ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py index 7de02d75c5744..8ddbc2e3925e4 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_gpt_generation_dataset.py @@ -18,12 +18,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueGPTGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -31,6 +32,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueGPTGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -80,7 +84,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -91,7 +95,6 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' For each example, this function determines the format of input and output sequences based on user-specified conguration. This is controlled by model.dataset.input_field and model.dataset.output_field @@ -99,9 +102,9 @@ def __getitem__(self, idx: int): If model.dataset.input_field == response and model.dataset.output_field == fluent_response: Input = "response: " and output = "response: fluent_response: " (with loss calculated from only) If model.dataset.input_field == utterance and model.dataset.output_field == response: - Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) + Input = "utterance: " and output = "utterance: response: " (with loss calculated from only) If model.dataset.input_field == passage+utterance and model.dataset.output_field == response: - Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) + Input = "passage: utterance: " and output="passage: utterance: response: " (with loss calculated from only) ''' ex = self.features[idx].data diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py index 8618f2f8c7b4b..dc123ca0e3d73 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_nearest_neighbour_dataset.py @@ -17,6 +17,7 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourDataset'] @@ -33,6 +34,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c dialogues_processor: Data generator for dialogues tokenizer: tokenizer to split text into sub-word tokens """ + # deprecation warning + deprecated_warning("DialogueNearestNeighbourDataset") + self.cfg = cfg self.tokenizer = tokenizer self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py index 78fda55edd2ea..df522b74e8614 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_s2s_generation_dataset.py @@ -16,12 +16,13 @@ import torch from nemo.collections.nlp.data.dialogue.dataset.dialogue_dataset import DialogueDataset +from nemo.utils.decorators import deprecated_warning class DialogueS2SGenerationDataset(DialogueDataset): def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, cfg): - """ Constructor - Designed for free form generation tasks such as Dialogue Response Generation + """Constructor + Designed for free form generation tasks such as Dialogue Response Generation Args: dataset_split: dataset split @@ -29,6 +30,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c tokenizer: tokenizer cfg: cfg container for dataset """ + # deprecation warning + deprecated_warning("DialogueS2SGenerationDataset") + self.cfg = cfg self.input_label_type = self.cfg.input_field self.output_label_type = self.cfg.output_field @@ -45,7 +49,7 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c @staticmethod def format_actions(prompt_template, actions): """ - Formats actions based on prompt_template + Formats actions based on prompt_template Args: prompt_template: determines whether acts, slot-names, slot-values are necessary in formatted actions @@ -118,7 +122,7 @@ def format_prompt(self, ex): ''' Formats training prompt based on self.input_field_type - Training example: + Training example: e.g. response: # input_label_type = response e.g. utterance: # input_label_type = utterance e.g. passage: utterance: # input_label_type = passage+utterance @@ -128,13 +132,12 @@ def format_prompt(self, ex): return input_sentence def __getitem__(self, idx: int): - ''' State how the input and output samples look like This template can be changed - Training example: + Training example: e.g. INPUT - "response: " OUTPUT - "" # input_label_type = response, output_label_type = fluent_response e.g. INPUT - "utterance: " OUTPUT - "" # input_label_type = utterance, output_label_type = response e.g. INPUT - "passage: utterance: " OUTPUT - "" # input_label_type = passage+utterance, output_label_type = response diff --git a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py index f2a0f58bcfac2..c1308238bea14 100644 --- a/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py +++ b/nemo/collections/nlp/data/dialogue/dataset/dialogue_zero_shot_intent_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.data.glue_benchmark.glue_benchmark_dataset import GLUEDataset from nemo.core.neural_types import CategoricalValuesType, ChannelType, MaskType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentDataset'] @@ -36,8 +37,7 @@ class DialogueZeroShotIntentDataset(GLUEDataset): @property def output_types(self) -> Optional[Dict[str, NeuralType]]: - """Returns definitions of module output ports. - """ + """Returns definitions of module output ports.""" return { 'input_ids': NeuralType(('B', 'T'), ChannelType()), 'segment_ids': NeuralType(('B', 'T'), ChannelType()), @@ -55,6 +55,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c num_classes: number of classes in the data (should be either 2 or 3, corresponding to labels ['entailment', 'not_entailment'] or ["contradiction", "entailment", "neutral"]) """ + # deprecation warning + deprecated_warning("DialogueZeroShotIntentDataset") + self.cfg = cfg self.tokenizer = tokenizer if self.cfg.num_classes not in [2, 3]: @@ -69,9 +72,9 @@ def __init__(self, dataset_split: str, dialogues_processor: object, tokenizer, c 'eos_token': tokenizer.eos_token, 'pad_token': tokenizer.pad_token, 'cls_token': tokenizer.cls_token, - 'sep_token_extra': tokenizer.eos_token - if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() - else None, + 'sep_token_extra': ( + tokenizer.eos_token if hasattr(tokenizer, 'name') and 'roberta' in tokenizer.name.lower() else None + ), } self.raw_features = dialogues_processor.get_dialog_examples(dataset_split) @@ -128,9 +131,9 @@ def convert_examples_to_features( * True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] The `cls_token_segment_id` defines the segment id associated to the CLS token (0 for BERT, 2 for XLNet) - + The convention in BERT is: - + a. For sequence pairs: * tokens: [CLS] is this jack ##ville ? [SEP] no it is not . [SEP] * type_ids: 0 0 0 0 0 0 0 1 1 1 1 1 1 @@ -148,9 +151,9 @@ def convert_examples_to_features( For classification tasks, the first vector (corresponding to [CLS]) is used as as the "sentence vector". Note that this only makes sense because the entire model is fine-tuned. - + The convention for NMT is: - + a. For sequence pairs: * tokens: is this jack ##ville ? no it is not . * type_ids:0 0 0 0 0 0 0 1 1 1 1 1 1 1 diff --git a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py index 5d985466ff6cb..bbd14f47a6514 100644 --- a/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/base_prompt_learning_dataset.py @@ -17,6 +17,7 @@ from nemo.collections.nlp.modules.common import VirtualPromptSource from nemo.core import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['BasePromptLearningDataset'] @@ -41,6 +42,9 @@ def __init__( add_eos: bool = True, for_train: bool = True, ): + # deprecation warning + deprecated_warning("BasePromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -72,7 +76,7 @@ def __init__( raise ValueError("Datasets must be a list of dicts or a list of filepath strings") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -85,7 +89,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits return input_example def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virtual_tokens=0): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -115,7 +119,7 @@ def _truncate_input(self, truncation_field, input_ids, taskname, doc, total_virt return input_ids def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -187,11 +191,11 @@ def pad_taskname_ids(self, taskname_ids): def find_subsequence_location(sequence, subsequence): - """ Finds the start and end index of the first occurance - of a given subsequence within a larger list. Returns - the two indices corresponding to the postition of - the first and last token of the subseqeunce. - Assumes subsequence is known to be in sequence. + """Finds the start and end index of the first occurance + of a given subsequence within a larger list. Returns + the two indices corresponding to the postition of + the first and last token of the subseqeunce. + Assumes subsequence is known to be in sequence. """ assert len(sequence) >= len(subsequence), "subsequence too long" diff --git a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py index 4b1b4f61d4391..11795bd150f11 100755 --- a/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py +++ b/nemo/collections/nlp/data/language_modeling/megatron/gpt_prompt_learning_dataset.py @@ -23,6 +23,7 @@ from nemo.collections.nlp.modules.common.megatron.utils import build_position_ids from nemo.core import Dataset from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GPTPromptLearningDataset'] @@ -30,7 +31,7 @@ class GPTPromptLearningDataset(Dataset): """ The dataset class for prompt-tuning or p-tuning pretrained GPT models. - + Args: data (list[strings], list[dicts]): (1) paths to .jsonl or .json files, (2) dict objects corresponding to each input example tokenizer (tokenizer): Tokenizer from frozen language model @@ -39,7 +40,7 @@ class GPTPromptLearningDataset(Dataset): pseudo_tokens (list[strings]): A list of virtual prompt token placeholders e.g [, , ...] up to max num virtual tokens pad_token_id (int): ID of pad token from tokenizer max_seq_length (int): maximum sequence length for each dataset examples. Examples will either be truncated to fit this length or dropped if they cannot be truncated. - min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. + min_seq_length (int): min length of each data example in the dataset. Data examples will be dropped if they do not meet the min length requirements. add_bos (bool): Whether to add a beginning of sentence token to each data example add_eos (bool): Whether to add an end of sentence token to each data example for_train (bool): Whether you're creating a dataset for training or inference @@ -63,6 +64,9 @@ def __init__( cache_data_path: str = None, # the cache file load_cache: bool = True, # whether to load from the cache if it is available ): + # deprecation warning + deprecated_warning("GPTPromptLearningDataset") + self.tokenizer = tokenizer self.virtual_prompt_source = virtual_prompt_source self.task_templates = task_templates @@ -112,9 +116,9 @@ def __init__( def load_data(self, dataset): """ Loads a dataset by filling in the task templates specified in the config file - with the information from each training/inference example. Converts all input - text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in - the task templates with the actual virtual prompt token ids. + with the information from each training/inference example. Converts all input + text into token ids. Also replaces the <|VIRTUAL_PROMPT_#|> placeholders in + the task templates with the actual virtual prompt token ids. params: dataset: A list of json objects or a dictionary objects each @@ -241,7 +245,7 @@ def _input_sanity_checks( assert prompt_template[placeholder_start:] == answer_placeholder, "Answer field must be at prompt end" def _insert_text_in_template(self, input_example, prompt_template_fields, doc): - """ Format the input example according to the template """ + """Format the input example according to the template""" for field in prompt_template_fields: if field in doc.keys(): field_text = doc[field] @@ -255,7 +259,7 @@ def _insert_text_in_template(self, input_example, prompt_template_fields, doc): return input_example.strip(" ") def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits): - """ Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers """ + """Insert the correct number of pseudo tokens at the <|VIRTUAL_PROMPT_n|> markers""" total_inserted_tokens = 0 for idx in range(len(virtual_token_splits)): @@ -270,7 +274,7 @@ def _insert_virtual_token_placeholders(self, input_example, virtual_token_splits def _truncate_input( self, truncation_field, input_ids, taskname, doc, prompt_template, prompt_template_fields, virtual_token_splits ): - """ Try to truncate input text to fit into the max sequence length """ + """Try to truncate input text to fit into the max sequence length""" logging.info( f"Input greater than max sequence length. Attempting to truncate: '{truncation_field}' in task: '{taskname}'" ) @@ -297,8 +301,8 @@ def _truncate_input( return input_ids def _find_answer_start(self, taskname, input_ids, answer_field, doc): - """ Find the token ids corresponding to the answer start, for loss masking purposes. - Assumes the answer is always at the end of the prompt. + """Find the token ids corresponding to the answer start, for loss masking purposes. + Assumes the answer is always at the end of the prompt. """ answer_text = doc[answer_field] answer_text = self._add_leading_space(taskname, answer_field, answer_text) @@ -313,7 +317,7 @@ def _find_answer_start(self, taskname, input_ids, answer_field, doc): return answer_start_idx def _add_leading_space(self, taskname, field_name, field_text): - """ Add leading space to text if there is a space before it in the template """ + """Add leading space to text if there is a space before it in the template""" prompt_template = self.task_templates[taskname]["prompt_template"] field_text_start = prompt_template.find("{" + field_name + "}") if field_text_start != 0 and prompt_template[field_text_start - 1] == " ": @@ -331,7 +335,7 @@ def _ceil_to_nearest(self, n, m): return (n + m - 1) // m * m def collate_fn(self, batch, tp_workers=0): - """ Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch """ + """Prepares input_ids, labels, loss mask, attention_mask, and position ids for global batch""" taskname_ids, input_ids, answer_starts = zip(*batch) # Pad taskname_ids to be the same length for the prompt encoder @@ -380,7 +384,7 @@ def collate_fn(self, batch, tp_workers=0): return input_ids, labels, loss_mask, position_ids, attention_mask, taskname_ids def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): - """ Pad input_ids in batch to max batch length while building loss mask """ + """Pad input_ids in batch to max batch length while building loss mask""" batch_loss_masks = [] padded_input_ids = [] for ids, answer_start_idx in zip(input_ids, answer_starts): @@ -410,7 +414,7 @@ def pad_batch_and_build_loss_mask(self, input_ids, batch_max, answer_starts): def inference_collate_fn(self, batch): """ - Used for loading inference data. + Used for loading inference data. """ task_id_nums, input_ids, answer_starts = zip(*batch) input_lengths = torch.cuda.LongTensor([len(inputs) for inputs in input_ids]) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py index 4070098b5e673..87174b69ffc2a 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_bert_dataset.py @@ -22,10 +22,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_bert_input_example import BERTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQADataset(QADataset): - """ Creates a Dataset for BERT architecture based Exractive QA """ + """Creates a Dataset for BERT architecture based Exractive QA""" def __init__( self, @@ -41,6 +42,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("BERTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -92,7 +96,7 @@ def __init__( self.features[i] = BERTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -110,7 +114,7 @@ def _set_cached_features_filename(self): ) def _convert_examples_to_features(self): - """ Converts loaded examples to features """ + """Converts loaded examples to features""" logging.info(f"Preprocessing data into features.") @@ -161,7 +165,7 @@ def _convert_examples_to_features(self): example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = self.tokenizer.text_to_tokens(token) @@ -199,7 +203,7 @@ def _convert_examples_to_features(self): # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [self.tokenizer.cls_token] + query_tokens + [self.tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py index 783b2dd33f313..553f5984952ca 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_dataset.py @@ -28,14 +28,24 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class QADataset(Dataset): - ''' Abstract base class for QA Datasets with common utility methods ''' + '''Abstract base class for QA Datasets with common utility methods''' def __init__( - self, data_file: str, processor: object, tokenizer: object, mode: str, num_samples: int, **kwargs, + self, + data_file: str, + processor: object, + tokenizer: object, + mode: str, + num_samples: int, + **kwargs, ): + # deprecation warning + deprecated_warning("QADataset") + self.mode = mode self.data_file = data_file self.processor = processor @@ -100,7 +110,7 @@ def get_best_span_index(doc_spans, position): best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -150,7 +160,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -179,7 +189,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -193,7 +203,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -204,7 +214,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -282,9 +292,13 @@ def get_doc_tokens_and_offset_from_context_id( @staticmethod def improve_answer_span( - doc_tokens: List[str], input_start: int, input_end: int, tokenizer: object, orig_answer_text: str, + doc_tokens: List[str], + input_start: int, + input_end: int, + tokenizer: object, + orig_answer_text: str, ): - """ Returns tokenized answer spans that better match the annotated answer """ + """Returns tokenized answer spans that better match the annotated answer""" tok_answer_text = " ".join(tokenizer.text_to_tokens(orig_answer_text)) diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py index d6484b33e202d..1eeb312a62a92 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_gpt_dataset.py @@ -24,10 +24,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_gpt_input_example import GPTQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQADataset(QADataset): - """ Creates a Dataset for GPT architecture based Generative QA """ + """Creates a Dataset for GPT architecture based Generative QA""" def __init__( self, @@ -44,6 +45,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("GPTQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -76,7 +80,7 @@ def __init__( self.features[i] = GPTQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -120,7 +124,11 @@ def _convert_examples_to_features(self): formatted_query, query_tokens_length = self._prep_query(query_prefix, example) formatted_answer, answer_tokens_length = self._prep_answer(example) context_tokens, context_spans = self._prep_context( - example, query_tokens_length, answer_tokens_length, context_prefix_tokens, answer_prefix_tokens, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ) unique_id = self._encode_all_context_spans( @@ -170,7 +178,12 @@ def _prep_answer(self, example): return self._get_truncated_sentence_and_len(target, self.max_answer_length) def _prep_context( - self, example, query_tokens_length, answer_tokens_length, context_prefix_tokens, answer_prefix_tokens, + self, + example, + query_tokens_length, + answer_tokens_length, + context_prefix_tokens, + answer_prefix_tokens, ): """ Calculates the maximum possible length for a given context given a question diff --git a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py index 1f9a8ef615a9a..c65c8a43c4404 100644 --- a/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py +++ b/nemo/collections/nlp/data/question_answering/dataset/qa_s2s_dataset.py @@ -23,10 +23,11 @@ from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset from nemo.collections.nlp.data.question_answering.input_example.qa_s2s_input_example import S2SQAInputExample from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQADataset(QADataset): - """ Creates a Dataset for T5/BART architecture based Generative QA """ + """Creates a Dataset for T5/BART architecture based Generative QA""" def __init__( self, @@ -43,6 +44,9 @@ def __init__( mode: str = TRAINING_MODE, use_cache: bool = False, ): + # deprecation warning + deprecated_warning("S2SQADataset") + super().__init__( data_file=data_file, processor=processor, tokenizer=tokenizer, mode=mode, num_samples=num_samples ) @@ -75,7 +79,7 @@ def __init__( self.features[i] = S2SQAInputExample(**self.features[i]) def _set_cached_features_filename(self): - """ Creates cache filename using dataset config parameters """ + """Creates cache filename using dataset config parameters""" vocab_size = getattr(self.tokenizer, "vocab_size", 0) self.cached_features_file = ( @@ -117,7 +121,12 @@ def _convert_examples_to_features(self): context_tokens, context_spans = self._prep_context(example, query_tokens, context_prefix_tokens) unique_id = self._encode_all_context_spans( - unique_id, context_spans, context_tokens, formatted_query, example, example_index, + unique_id, + context_spans, + context_tokens, + formatted_query, + example, + example_index, ) # delete self.examples during training mode to save memory @@ -155,7 +164,13 @@ def _prep_context(self, example, query_tokens, context_prefix_tokens): return context_tokens, context_spans def _encode_all_context_spans( - self, unique_id, context_spans, context_tokens, formatted_query, example, example_index, + self, + unique_id, + context_spans, + context_tokens, + formatted_query, + example, + example_index, ): """ Fromats all spans extracted from a single context as: @@ -173,7 +188,11 @@ def _encode_all_context_spans( # encode input encoded_input_dict = self.tokenizer.tokenizer( - source, truncation=True, max_length=self.max_seq_length, padding="max_length", return_tensors="pt", + source, + truncation=True, + max_length=self.max_seq_length, + padding="max_length", + return_tensors="pt", ) input_ids = torch.squeeze(encoded_input_dict["input_ids"]) input_attn_mask = torch.squeeze(encoded_input_dict["attention_mask"]) @@ -223,7 +242,11 @@ def _encode_answer(self, example, context_span_text): target = example.answer_text encoded_output_dict = self.tokenizer.tokenizer( - target, truncation=True, max_length=self.max_answer_length, padding="max_length", return_tensors="pt", + target, + truncation=True, + max_length=self.max_answer_length, + padding="max_length", + return_tensors="pt", ) labels = torch.squeeze(encoded_output_dict["input_ids"]) labels[labels == self.tokenizer.tokenizer.pad_token_id] = -100 diff --git a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py index ee1a0957dbbb8..2abe9b7c0aaa6 100644 --- a/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py +++ b/nemo/collections/nlp/data/question_answering_squad/qa_dataset.py @@ -46,6 +46,7 @@ ) from nemo.core.classes import Dataset from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SquadDataset', 'InputFeatures', '_check_is_max_context'] @@ -114,7 +115,7 @@ def get_best_span_index(doc_spans, position): """ best_score = None best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): + for span_index, doc_span in enumerate(doc_spans): end = doc_span.start + doc_span.length - 1 if position < doc_span.start: continue @@ -165,6 +166,9 @@ def __init__( mode: str, use_cache: bool, ): + # deprecation warning + deprecated_warning("SquadDataset") + self.tokenizer = tokenizer self.version_2_with_negative = version_2_with_negative self.processor = SquadProcessor(data_file=data_file, mode=mode) @@ -337,7 +341,7 @@ def get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride): all_doc_tokens: list of all tokens in document max_tokens_for_doc: maximum number of tokens in each doc span doc_stride: stride size which sliding window moves with - + Returns: doc_spans: all possible doc_spans from document """ @@ -375,7 +379,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ doc_span tok_start_position: start position of answer in document tok_end_position: end position of answer in document - + Returns: average distance of doc_span to answer """ @@ -387,7 +391,7 @@ def get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_ @staticmethod def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode): """ - Filters out doc_spans, which might not be relevant to answering question, + Filters out doc_spans, which might not be relevant to answering question, which can be helpful when document is extremely long leading to many doc_spans with no answers Args: @@ -398,7 +402,7 @@ def keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode all: do not filter only_positive: only keep doc_spans containing the answer limited_negative: only keep 10 doc_spans that are nearest to answer - + Returns: doc_spans: doc_spans after filtering """ @@ -481,7 +485,7 @@ def convert_examples_to_features( if self.mode != TRAINING_MODE: example.doc_tokens = doc_tokens # the text to tokens step is the slowest step - for (i, token) in enumerate(doc_tokens): + for i, token in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) if token not in text_to_tokens_dict: text_to_tokens_dict[token] = tokenizer.text_to_tokens(token) @@ -521,7 +525,7 @@ def convert_examples_to_features( # make compatible for hashing doc_spans = tuple(doc_spans) - for (doc_span_index, doc_span) in enumerate(doc_spans): + for doc_span_index, doc_span in enumerate(doc_spans): tokens = [tokenizer.cls_token] + query_tokens + [tokenizer.sep_token] segment_ids = [0 for i in range(len(tokens))] @@ -681,7 +685,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(self.examples): + for example_index, example in enumerate(self.examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -706,7 +710,7 @@ def get_predictions( null_start_logit = 0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(features): + for feature_index, feature in enumerate(features): pos = unique_id_to_pos[feature.unique_id] start_indexes = get_best_indexes(start_logits[pos], n_best_size) end_indexes = get_best_indexes(end_logits[pos], n_best_size) @@ -825,7 +829,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text diff --git a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py index 803d0eaf8aed2..c98abb300c64d 100644 --- a/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py +++ b/nemo/collections/nlp/data/spellchecking_asr_customization/bert_example.py @@ -20,6 +20,8 @@ from transformers import PreTrainedTokenizerBase +from nemo.utils.decorators import deprecated_warning + """Build BERT Examples from asr hypothesis, customization candidates, target labels, span info. """ @@ -52,7 +54,7 @@ def __init__( input_ids: indices of single characters (treated as subwords) input_mask: list of bools with 0s in place of input_ids to be masked segment_ids: list of ints from 0 to 10 to denote the text segment type ( - 0 - for tokens of ASR hypothesis, + 0 - for tokens of ASR hypothesis, 1 - for tokens of the first candidate ... 10 - for tokens of the tenth candidate @@ -60,7 +62,7 @@ def __init__( input_ids_for_subwords: indices of real subwords (as tokenized by bert tokenizer) input_mask_for_subwords: list of bools with 0s in place of input_ids_for_subwords to be masked segment_ids_for_subwords: same as segment_ids but for input_ids_for_subwords - character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) + character_pos_to_subword_pos: list of size=len(input_ids), value=(position of corresponding subword in input_ids_for_subwords) fragment_indices: list of tuples (start_position, end_position, candidate_id), end is exclusive, candidate_id can be -1 if not set labels_mask: bool tensor with 0s in place of label tokens to be masked labels: indices of semiotic classes which should be predicted from each of the @@ -68,6 +70,9 @@ def __init__( spans: list of tuples (class_id, start_position, end_position), end is exclusive, class is always 1(CUSTOM) default_label: The default label """ + # deprecation warning + deprecated_warning("BertExample") + input_len = len(input_ids) if not ( input_len == len(input_mask) @@ -123,6 +128,9 @@ def __init__( tokenizer: Tokenizer object. max_seq_length: Maximum sequence length. """ + # deprecation warning + deprecated_warning("BertExampleBuilder") + self._label_map = label_map self._semiotic_classes = semiotic_classes self._tokenizer = tokenizer @@ -183,9 +191,15 @@ def build_bert_example( tags[start:end] = [t for i in range(end - start)] # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = self._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) + ( + input_ids, + input_mask, + segment_ids, + labels_mask, + labels, + _, + _, + ) = self._get_input_features(hyp=hyp, ref=ref, tags=tags) # get input features for words hyp_with_words = hyp.replace(" ", "").replace("_", " ") @@ -243,11 +257,11 @@ def build_bert_example( return example def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: - """ Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample - - Example: - span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] + """Converts span_info string into a list of (class_id, start, end) where start, end are coordinates of starting and ending(exclusive) tokens in input_ids of BertExample + + Example: + span_info_parts: ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] + result: [(1, 38, 42), (1, 48, 53), (1, 43, 47), (1, 1, 8)] """ result_spans = [] @@ -267,26 +281,26 @@ def _get_spans(self, span_info_parts: List[str]) -> List[Tuple[int, int, int]]: def _get_fragment_indices( self, hyp: str, targets: List[int], span_info_parts: List[str] ) -> Tuple[List[Tuple[int, int, int]]]: - """ Build fragment indices for real candidates. - This is used only at inference. - After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). - In this function we - 1) adjust start/end positions to match word borders (possibly in multiple ways). - 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). - - Args: - hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). - targets: list of candidate ids (only for real candidates, not dummy) - span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. - Returns: - List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. - Note that returned fragments can be unsorted and can overlap, it's ok. - Example: - hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - targets: [1 2 3 4 6 7 9] - span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. - fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] - """ + """Build fragment indices for real candidates. + This is used only at inference. + After external candidate retrieval we know approximately, where the candidate is located in the text (from the positions of matched n-grams). + In this function we + 1) adjust start/end positions to match word borders (possibly in multiple ways). + 2) generate content for fragment_indices tensor (it will be used during inference to average all predictions inside each fragment). + + Args: + hyp: ASR-hypothesis where space separates single characters (real space is replaced to underscore). + targets: list of candidate ids (only for real candidates, not dummy) + span_info_parts: list of strings of format like "CUSTOM 12 25", corresponding to each of targets, with start/end coordinates in text. + Returns: + List of tuples (start, end, target) where start and end are positions in ASR-hypothesis, target is candidate_id. + Note that returned fragments can be unsorted and can overlap, it's ok. + Example: + hyp: "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" + targets: [1 2 3 4 6 7 9] + span_info_parts: ["CUSTOM 12 25", "CUSTOM 0 10", "CUSTOM 27 42", ...], where numbers are EXPECTED start/end positions of corresponding target candidates in the text. These positions will be adjusted in this functuion. + fragment_indices: [(1, 12, 2), (13, 24, 1), (13, 28, 1), ..., (29, 42, 3)] + """ fragment_indices = [] @@ -337,18 +351,18 @@ def _get_fragment_indices( return fragment_indices def _map_characters_to_subwords(self, input_ids: List[int], input_ids_for_subwords: List[int]) -> List[int]: - """ Maps each single character to the position of its corresponding subword. - - Args: - input_ids: List of character token ids. - input_ids_for_subwords: List of subword token ids. - Returns: - List of subword positions in input_ids_for_subwords. Its length is equal to len(input_ids) - - Example: - input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] - input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] - result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47] + """Maps each single character to the position of its corresponding subword. + + Args: + input_ids: List of character token ids. + input_ids_for_subwords: List of subword token ids. + Returns: + List of subword positions in input_ids_for_subwords. Its length is equal to len(input_ids) + + Example: + input_ids: [101, 1037, 1055, 1056, 1054, 1051, 1050, ..., 1051, 102, 1040, ..., 1050, 102, 1037, ..., 1041, 102, ..., 102] + input_ids_for_subwords: [101, 26357, 2106, 2666, 2061, 8202, 1998, 13012, 16643, 2319, 1043, 7174, 102, 2106, 3771, 7842, 2819, 2239, 102, ..., 102] + result: [0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5, 5, 5, ... , 45, 46, 46, 46, 46, 46, 47] """ character_pos_to_subword_pos = [0 for _ in input_ids] @@ -453,7 +467,7 @@ def _get_input_features( ref: "didier saumon;astronomie;tristan guillot;tristesse;monade;christian;astronomer;solomon;dididididi;mercy" tags: None (not used for word-based case) - resulting token sequence: + resulting token sequence: '[CLS]', 'astronomers', 'did', '##ie', 'so', '##mon', 'and', 'tri', '##sti', '##an', 'g', '##llo', '[SEP]', 'did', '##ier', 'sa', '##um', '##on', '[SEP]', 'astro', '##no', '##mie', '[SEP]', 'tristan', 'gui', '##llo', '##t', '[SEP]', ..., '[SEP]', 'mercy', '[SEP]'] """ @@ -542,9 +556,9 @@ def read_input_file( infer: If true, input examples do not contain target info. Returns: - examples: List of converted examples (BertExample). + examples: List of converted examples (BertExample). or - (examples, hyps_refs): If infer==true, returns h + (examples, hyps_refs): If infer==true, returns h """ if not path.exists(input_filename): diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py index 7737bfa67f00c..07ca790866c7c 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_classification_model.py @@ -45,14 +45,19 @@ from nemo.collections.nlp.parts.nlp_overrides import NLPSaveRestoreConnector from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTClassificationModel'] class DialogueGPTClassificationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTClassificationModel") self.cfg = cfg self.eval_mode = cfg.dataset.eval_mode @@ -101,14 +106,14 @@ def __init__( def setup_optimizer_param_groups(self): """ - ModelPT override for prompt learning. - Optimizer will get self._optimizer_param_groups. + ModelPT override for prompt learning. + Optimizer will get self._optimizer_param_groups. Makes two optimizer param groups, one for the frozen model params - and one for the prompt-table/prompt-encoder params. The learning + and one for the prompt-table/prompt-encoder params. The learning rate for the frozen model's params will always be zero effectively freezing the model's params but still allowing for the needed gradients - to be passed around in pipeline parallel models. The prompt-encoder - and/or prompt table will use the learning rate set by the user. + to be passed around in pipeline parallel models. The prompt-encoder + and/or prompt table will use the learning rate set by the user. """ if not self.prompt_learning: super().setup_optimizer_param_groups() @@ -328,7 +333,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): len(self.language_model.pseudo_token_ids) if hasattr(self.language_model, 'pseudo_token_ids') else 0 ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) prompt_ids = self.get_virtual_prompt_ids_for_megatron_gpt(input_ids) @@ -708,7 +716,9 @@ def prepare_data(self): ) elif self._cfg.dataset.task == 'design': self.dialogues_processor = DialogueDesignDataProcessor( - data_dir=self._cfg.dataset.data_dir, tokenizer=self.tokenizer, cfg=self._cfg.dataset, + data_dir=self._cfg.dataset.data_dir, + tokenizer=self.tokenizer, + cfg=self._cfg.dataset, ) else: raise ValueError("Only sgd, assistant, zero_shot, design supported for Dialogue GPT Classification Model") diff --git a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py index 602c15a50c761..116605b65d528 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_gpt_generation_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueGPTGenerationModel'] @@ -43,8 +44,12 @@ class DialogueGPTGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueGPTGenerationModel") self.cfg = cfg self.data_prepared = False @@ -108,7 +113,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) @@ -155,7 +163,10 @@ def forward(self, input_ids, attention_mask, labels, inference=True): ) position_ids = torch.arange( - start=0, end=num_prompt_tokens + input_ids.size(1), dtype=torch.long, device=input_ids.device, + start=0, + end=num_prompt_tokens + input_ids.size(1), + dtype=torch.long, + device=input_ids.device, ) position_ids = position_ids.unsqueeze(0).repeat(input_ids.size(0), 1) @@ -228,7 +239,7 @@ def setup(self, stage=None): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_learning else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py index 455b0fa17a856..29e2627fa038a 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_nearest_neighbour_model.py @@ -34,14 +34,18 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueNearestNeighbourModel'] class DialogueNearestNeighbourModel(NLPModel): - """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions """ + """Dialogue Nearest Neighbour Model identifies the intent of an utterance using the cosine similarity between sentence embeddings of the utterance and various label descriptions""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueNearestNeighbourModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) if self.cfg.library == "huggingface": @@ -155,7 +159,10 @@ def on_validation_epoch_end(self): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, decoded_inputs, + filename, + predicted_labels, + ground_truth_labels, + decoded_inputs, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} diff --git a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py index 9655fbea2722a..73f09f62b1d5a 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_s2s_generation_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -46,8 +47,12 @@ class DialogueS2SGenerationModel(NLPModel): def __init__( - self, cfg: DictConfig, trainer: Trainer = None, + self, + cfg: DictConfig, + trainer: Trainer = None, ): + # deprecation warning + deprecated_warning("DialogueS2SGenerationModel") self.cfg = cfg self.data_prepared = False @@ -120,7 +125,10 @@ def eval_epoch_end(self, outputs, mode='val'): ) DialogueGenerationMetrics.save_predictions( - filename, generated_field, ground_truth_field, inputs, + filename, + generated_field, + ground_truth_field, + inputs, ) label_acc = np.mean([int(generated_field[i] == ground_truth_field[i]) for i in range(len(generated_field))]) @@ -172,7 +180,7 @@ def forward(self, input_ids, attention_masks, labels): def prepare_megatron_generation(self, labels, input_ids, template_length): """ - # adapted from MegatronGPTModel._bucketize_gpt_inference + # adapted from MegatronGPTModel._bucketize_gpt_inference """ batch_size = labels.size(0) prompt_tags = [self.prompt_tags[0]] * batch_size if self.prompt_tags else None diff --git a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py index 0e007a7bcdd1b..5298c060df089 100644 --- a/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py +++ b/nemo/collections/nlp/models/dialogue/dialogue_zero_shot_intent_model.py @@ -36,6 +36,7 @@ from nemo.collections.nlp.models import TextClassificationModel from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['DialogueZeroShotIntentModel'] @@ -44,6 +45,9 @@ class DialogueZeroShotIntentModel(TextClassificationModel): """TextClassificationModel to be trained on two- or three-class textual entailment data, to be used for zero shot intent recognition.""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("DialogueZeroShotIntentModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer) @@ -275,7 +279,10 @@ def on_validation_epoch_end(self, split="val"): filename = os.path.join(self.cfg.dataset.dialogues_example_dir, "test_predictions.jsonl") DialogueGenerationMetrics.save_predictions( - filename, predicted_labels, ground_truth_labels, utterances, + filename, + predicted_labels, + ground_truth_labels, + utterances, ) label_to_ids = {label: idx for idx, label in enumerate(list(set(predicted_labels + ground_truth_labels)))} @@ -316,7 +323,6 @@ def predict( entailment_idx=1, contradiction_idx=0, ) -> List[Dict]: - """ Given a list of queries and a list of candidate labels, return a ranked list of labels and scores for each query. diff --git a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py index a34afa64674df..777d468084e22 100644 --- a/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py +++ b/nemo/collections/nlp/models/dialogue/intent_slot_classification_model.py @@ -35,12 +35,15 @@ from nemo.core.classes import typecheck from nemo.core.classes.common import PretrainedModelInfo from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class IntentSlotClassificationModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): - """ Initializes BERT Joint Intent and Slot model. - """ + """Initializes BERT Joint Intent and Slot model.""" + # deprecation warning + deprecated_warning("IntentSlotClassificationModel") + self.max_seq_length = cfg.dataset.max_seq_length self.cfg = cfg # Check the presence of data_dir. @@ -78,7 +81,7 @@ def _set_defaults_data_desc(self, cfg): OmegaConf.set_struct(cfg, True) def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): - """ Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc. """ + """Method creates IntentSlotDataDesc and copies generated values to cfg.data_desc.""" # Save data from data desc to config - so it can be reused later, e.g. in inference. data_desc = IntentSlotDataDesc(data_dir=data_dir, modes=[train_ds.prefix, validation_ds.prefix]) OmegaConf.set_struct(cfg, False) @@ -112,7 +115,7 @@ def _set_data_desc_to_cfg(self, cfg, data_dir, train_ds, validation_ds): OmegaConf.set_struct(cfg, True) def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: - """ Saves label ids map to a file """ + """Saves label ids map to a file""" with open(filename, 'w') as out: labels, _ = zip(*sorted(label_ids.items(), key=lambda x: x[1])) out.write('\n'.join(labels)) @@ -120,7 +123,7 @@ def _save_label_ids(self, label_ids: Dict[str, int], filename: str) -> None: logging.info(f'Labels mapping saved to : {out.name}') def _reconfigure_classifier(self): - """ Method reconfigures the classifier depending on the settings of model cfg.data_desc """ + """Method reconfigures the classifier depending on the settings of model cfg.data_desc""" self.classifier = SequenceTokenClassifier( hidden_size=self.hidden_size, @@ -310,7 +313,7 @@ def get_utterance_tokens(self, token_ids, token_masks): Args: token_ids: IntTensor of size (max_seq_len, ) token_masks: BoolTensor of size (max_seq_len, ) - + Returns token_list: List of Str (list of tokens with len <= max_seq_len) """ diff --git a/nemo/collections/nlp/models/dialogue/sgdqa_model.py b/nemo/collections/nlp/models/dialogue/sgdqa_model.py index b350fd01fa090..3b30dfccd9cee 100644 --- a/nemo/collections/nlp/models/dialogue/sgdqa_model.py +++ b/nemo/collections/nlp/models/dialogue/sgdqa_model.py @@ -35,6 +35,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['SGDQAModel'] @@ -44,7 +45,7 @@ class SGDQAModel(NLPModel): Dialogue State Tracking Model SGD-QA (https://arxiv.org/abs/2105.08049) The SGD-QA model is a fast multi-pass schema-guided state-tracking model, that is trained on the Google schema-guided state tracking dataset (https://arxiv.org/abs/1909.05855). - The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. + The model takes dialogue as input and outputs the dialogue state, which includes slot-value pairs. The model consists of two components: a neural natural language understanding model (NLU), and a rule-based state tracker. The NLU takes in a dialogue turn and different schema (entity) information options and outputs their match score. The state tracker takes the highest rated entities and composes the dialogue state across turns. @@ -55,6 +56,9 @@ def output_module(self): return self.decoder def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("SGDQAModel") + self.data_prepared = False super().__init__(cfg=cfg, trainer=trainer) self.encoder = SGDEncoder(hidden_size=self.bert_model.config.hidden_size, dropout=self._cfg.encoder.dropout) @@ -146,7 +150,7 @@ def validation_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_ Called at every validation step to aggregate and postprocess outputs on each GPU Args: batch: input batch at validation step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -163,7 +167,7 @@ def test_step(self, batch: List[torch.Tensor], batch_idx: int, dataloader_idx: i Called at every test step to aggregate and postprocess outputs on each GPU Args: batch: input batch at test step - batch_idx: batch index + batch_idx: batch index dataloader_idx: dataloader index """ loss, tensors = self.eval_step_helper(batch=batch) @@ -318,8 +322,8 @@ def eval_step_helper(self, batch: List[torch.Tensor]): torch.zeros(total_scores.size(), device=total_scores.get_device(), dtype=total_scores.dtype), total_scores, ) - max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens ** 2), axis=-1) - max_span_p = torch.max(total_scores.view(-1, max_num_tokens ** 2), axis=-1)[0] + max_span_index = torch.argmax(total_scores.view(-1, max_num_tokens**2), axis=-1) + max_span_p = torch.max(total_scores.view(-1, max_num_tokens**2), axis=-1)[0] span_start_index = torch.floor_divide(max_span_index, max_num_tokens) span_end_index = torch.fmod(max_span_index, max_num_tokens) @@ -415,7 +419,7 @@ def format_turn_id(ex_id_num): def combine_predictions_in_example(predictions: dict, batch_size: int): ''' - Combines predicted values to a single example. + Combines predicted values to a single example. Args: predictions: predictions ordered by keys then batch batch_size: batch size diff --git a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py index f3ef3ccb87f99..4afae81e38939 100644 --- a/nemo/collections/nlp/models/entity_linking/entity_linking_model.py +++ b/nemo/collections/nlp/models/entity_linking/entity_linking_model.py @@ -26,6 +26,7 @@ from nemo.core.classes.exportable import Exportable from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['EntityLinkingModel'] @@ -44,6 +45,9 @@ def output_types(self) -> Optional[Dict[str, NeuralType]]: def __init__(self, cfg: DictConfig, trainer: Trainer = None): """Initializes the SAP-BERT model for entity linking.""" + # deprecation warning + deprecated_warning("EntityLinkingModel") + # tokenizer needed before super().__init__() so dataset and loader can process data self._setup_tokenizer(cfg.tokenizer) @@ -123,7 +127,7 @@ def on_validation_epoch_end(self): Args: outputs: list of individual outputs of each validation step. Returns: - + """ if self.validation_step_outputs: avg_loss = torch.stack( diff --git a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py index 4a073e2ada1ca..4447ebb893862 100644 --- a/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py +++ b/nemo/collections/nlp/models/glue_benchmark/glue_benchmark_model.py @@ -31,6 +31,7 @@ from nemo.core.classes import typecheck from nemo.core.neural_types import NeuralType from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['GLUEModel'] @@ -78,6 +79,8 @@ def __init__(self, cfg: DictConfig, trainer: Trainer = None): """ Initializes model to use BERT model for GLUE tasks. """ + # deprecation warning + deprecated_warning("GLUEModel") if cfg.task_name not in cfg.supported_tasks: raise ValueError(f'{cfg.task_name} not in supported task. Choose from {cfg.supported_tasks}') diff --git a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py index e7ae529fe4e28..67a4802d83f6c 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/bert/bert_model.py @@ -14,7 +14,6 @@ """BERT model.""" -import warnings from dataclasses import dataclass import torch @@ -33,6 +32,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -142,7 +142,13 @@ def forward(self, hidden_states, word_embeddings_weight): def post_language_model_processing( - lm_output, pooled_output, lm_head, binary_head, lm_labels, logit_weights, fp16_lm_cross_entropy, + lm_output, + pooled_output, + lm_head, + binary_head, + lm_labels, + logit_weights, + fp16_lm_cross_entropy, ): # lm_logits: [s, b, vocab_size] lm_logits = lm_head(lm_output, logit_weights) @@ -348,7 +354,10 @@ def __init__(self, transformer_block_type='pre-ln', add_pooler=True, *args, **kw if self.post_process: # TODO: Make sure you are passing in the mpu_vocab_size properly - self.lm_head = MCoreBertLMHead(self.config.hidden_size, self.config,) + self.lm_head = MCoreBertLMHead( + self.config.hidden_size, + self.config, + ) self.output_layer = tensor_parallel.ColumnParallelLinear( self.config.hidden_size, @@ -476,10 +485,9 @@ def __init__( sequence_parallel=False, position_embedding_type='learned_absolute', ): - warnings.warn( - "NeMoBertModel will be deprecated mid 2024. Use MCoreBertModelWrapperWithPostLNSupport instead.", - DeprecationWarning, - ) + # deprecation warning + deprecated_warning("NeMoBertModel", "MCoreBertModelWrapperWithPostLNSupport") + super(NeMoBertModel, self).__init__(config=config) self.fp16_lm_cross_entropy = fp16_lm_cross_entropy self.add_binary_head = add_binary_head diff --git a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py index 19fafb796fd73..c572d94acd110 100755 --- a/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron/gpt_model.py @@ -24,6 +24,7 @@ parallel_lm_logits, scaled_init_method_normal, ) +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.enums import AttnMaskType @@ -167,6 +168,9 @@ def __init__( seq_len_interpolation_factor=None, rotary_base=10000, ): + # deprecation warning + deprecated_warning("GPTModel", "McoreGPTModel") + super(GPTModel, self).__init__(config=config, share_token_embeddings=share_embeddings_and_output_weights) self.parallel_output = parallel_output @@ -250,7 +254,9 @@ def __init__( if self.share_embeddings_and_output_weights: self.initialize_word_embeddings( - init_method=init_method_normal(init_method_std), vocab_size=vocab_size, hidden_size=hidden_size, + init_method=init_method_normal(init_method_std), + vocab_size=vocab_size, + hidden_size=hidden_size, ) def set_input_tensor(self, input_tensor): @@ -299,9 +305,11 @@ def forward( post_process_result = post_language_model_processing( loss_lm_output, loss_labels, - self.language_model.output_layer.weight - if not self.share_embeddings_and_output_weights - else self.word_embeddings_weight(), + ( + self.language_model.output_layer.weight + if not self.share_embeddings_and_output_weights + else self.word_embeddings_weight() + ), get_key_value, self.parallel_output, forward_method_parallel_output, diff --git a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py index d151925635ab6..f6ee4b20183c5 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_base_prompt_learning_model.py @@ -37,6 +37,7 @@ from nemo.collections.nlp.modules.common.transformer.text_generation import TextGeneration from nemo.collections.nlp.parts.nlp_overrides import GradScaler from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import _reconfigure_microbatch_calculator @@ -82,6 +83,9 @@ class MegatronBasePromptLearningModel(MegatronBaseModel, TextGeneration): """ def __init__(self, cfg: DictConfig, trainer: Trainer): + # deprecation warning + deprecated_warning("MegatronBasePromptLearningModel") + super().__init__(cfg, trainer) self.init_model(cfg, trainer) diff --git a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py index 5ee7a3fcf4806..acfc22439a7db 100644 --- a/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py +++ b/nemo/collections/nlp/models/language_modeling/megatron_gpt_prompt_learning_model.py @@ -44,6 +44,7 @@ from nemo.collections.nlp.parts.nlp_overrides import GradScaler, NLPSaveRestoreConnector from nemo.collections.nlp.parts.utils_funcs import get_last_rank from nemo.utils import AppState, logging +from nemo.utils.decorators import deprecated_warning try: from apex.transformer.pipeline_parallel.utils import get_micro_batch_size, get_num_microbatches @@ -72,25 +73,28 @@ class MegatronGPTPromptLearningModel(MegatronBasePromptLearningModel): """ - Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. + Model class for prompt-tuning or p-tuning a pretrained Megatron GPT model. Prompt Tuning initalizes virtual prompt embeddings directly from a copy of certain token embeddings from the the pretrained GPT model's vocabulary - and directly tunes these embedding weights. The token embeddings used in - initalization are specified by the user in the config file. The model can - be prompt-tuned for multiple tasks at once. virtual prompts are stored in a - prompt table and can be added or deleted without disrupting virtual prompts - for other tasks. + and directly tunes these embedding weights. The token embeddings used in + initalization are specified by the user in the config file. The model can + be prompt-tuned for multiple tasks at once. virtual prompts are stored in a + prompt table and can be added or deleted without disrupting virtual prompts + for other tasks. P-tuning initializes an LSTM encoder model that generates virtual prompt embeddings for every task. Each task shares the same encoder. After ptuning is compelete, the learned virtual prompts can be saved to the prompt table - using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a - new virtual prompt via p-tuning, they do not need to retrain on all previous + using add_ptuned_prompts_to_prompt_table(). Thus, if a user wants to add a + new virtual prompt via p-tuning, they do not need to retrain on all previous tasks. This gives p-tuning the same task flexiblity as prompt-tuning. """ def __init__(self, cfg: DictConfig, trainer: Trainer): + # deprecation warning + deprecated_warning("MegatronGPTPromptLearningModel") + super().__init__(cfg, trainer) self.inference_params = None @@ -305,8 +309,8 @@ def forward( def fwd_bwd_step(self, dataloader_iter, batch_idx, forward_only): """ - Dataloader produces a global batch which is turned into an iterator of microbatches. - The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. + Dataloader produces a global batch which is turned into an iterator of microbatches. + The iterator of microbatches is then piped through the pipeline using Core's fwd/bwd functions. """ # Get seq length of batch batch, _, _ = next(dataloader_iter) @@ -361,15 +365,15 @@ def training_step(self, dataloader_iter): return loss_mean def backward(self, *args, **kwargs): - """ LightningModule hook to do backward. - We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. - No need to call it here. + """LightningModule hook to do backward. + We want this to do nothing since we run backward in the fwd/bwd functions from megatron-core. + No need to call it here. """ return def optimizer_zero_grad(self, *args, **kwargs): - """ LightningModule hook to zero grad. - We want this to do nothing as we are zeroing grads during the training_step. + """LightningModule hook to zero grad. + We want this to do nothing as we are zeroing grads during the training_step. """ return @@ -415,11 +419,19 @@ def validation_step(self, dataloader_iter): labels_text.append(label) if mode == 'val': self.validation_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) else: self.test_step_outputs.append( - {'loss': loss_mean, 'preds': preds_text, 'labels': labels_text,} + { + 'loss': loss_mean, + 'preds': preds_text, + 'labels': labels_text, + } ) return { 'loss': loss_mean, @@ -427,8 +439,10 @@ def validation_step(self, dataloader_iter): 'labels': labels_text, } - self.validation_step_outputs.append({'loss': loss_mean}) if mode == 'val' else self.test_step_outputs.append( - {'loss': loss_mean} + ( + self.validation_step_outputs.append({'loss': loss_mean}) + if mode == 'val' + else self.test_step_outputs.append({'loss': loss_mean}) ) return {'loss': loss_mean} @@ -481,7 +495,8 @@ def on_validation_epoch_end(self): gather_results_dedup = list(set(itertools.chain(*gather_results))) val_metric_dict = self.validation_metric.get_score( - [i[1] for i in gather_results_dedup], [i[0] for i in gather_results_dedup], + [i[1] for i in gather_results_dedup], + [i[0] for i in gather_results_dedup], ) for metric, val in val_metric_dict.items(): @@ -638,9 +653,9 @@ def build_virtual_prompt_dataset( drop_last=drop_last, num_workers=num_workers, pin_memory=pin_memory, - persistent_workers=True - if num_workers > 0 - else False, # (@adithyare and @eharper) We need this to make spawn=True to work. + persistent_workers=( + True if num_workers > 0 else False + ), # (@adithyare and @eharper) We need this to make spawn=True to work. ) return dataset, dataloader @@ -815,7 +830,7 @@ def list_available_models(cls): def get_pseudo_tokens(num_virtual_tokens): """ Takes in an integer and returns a list of strings where each string - is a numbered virtual token placeholder. If + is a numbered virtual token placeholder. If num_virtual_tokens = 3, then this function returns: ["", "", ""] @@ -823,7 +838,7 @@ def get_pseudo_tokens(num_virtual_tokens): Args: num_virtual_tokens: (int) Number of virtual token strings you want to make - returns a list of string. + returns a list of string. """ pseudo_tokens = [ diff --git a/nemo/collections/nlp/models/question_answering/qa_base_model.py b/nemo/collections/nlp/models/question_answering/qa_base_model.py index bfb45f51b6ac7..7ca78f2e136e4 100644 --- a/nemo/collections/nlp/models/question_answering/qa_base_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_base_model.py @@ -25,10 +25,14 @@ ) from nemo.collections.nlp.models.nlp_model import NLPModel from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BaseQAModel(NLPModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None, no_lm_init=True): + # deprecation warning + deprecated_warning("BaseQAModel") + self.cfg = cfg super().__init__(cfg=cfg, trainer=trainer, no_lm_init=no_lm_init) @@ -82,10 +86,13 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): @torch.no_grad() def _get_per_sample_perplexity(self, logits, labels): - """ Returns average perplexity for each sample in the batch """ + """Returns average perplexity for each sample in the batch""" loss_fct = torch.nn.CrossEntropyLoss(ignore_index=-100, reduction='none') - unreduced_loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1),) + unreduced_loss = loss_fct( + logits.view(-1, logits.size(-1)), + labels.view(-1), + ) unreduced_loss = unreduced_loss.reshape(labels.shape) mask_0 = unreduced_loss != 0 per_sample_perplexity = torch.exp((unreduced_loss * mask_0).sum(axis=1) / mask_0.sum(axis=1)) diff --git a/nemo/collections/nlp/models/question_answering/qa_bert_model.py b/nemo/collections/nlp/models/question_answering/qa_bert_model.py index 196fab4e3a046..d4bdef6d871dc 100644 --- a/nemo/collections/nlp/models/question_answering/qa_bert_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_bert_model.py @@ -31,12 +31,15 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class BERTQAModel(BaseQAModel): - """ BERT model with a QA (token classification) head """ + """BERT model with a QA (token classification) head""" def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("BERTQAModel") super().__init__(cfg=cfg, trainer=trainer, no_lm_init=False) self.classifier = TokenClassifier( @@ -190,7 +193,7 @@ def inference( num_samples: number of samples to use of inference data. Default: -1 if all data should be used. output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ @@ -209,7 +212,10 @@ def inference( logging.set_verbosity(logging.WARNING) infer_datalayer = self.setup_inference_data( - file, batch_size=batch_size, num_samples=num_samples, num_workers=2, + file, + batch_size=batch_size, + num_samples=num_samples, + num_workers=2, ) all_logits = [] @@ -244,7 +250,9 @@ def inference( if output_prediction_file: QAMetrics.dump_predicted_answers_to_file( - output_prediction_file, infer_datalayer.dataset.examples, all_predictions, + output_prediction_file, + infer_datalayer.dataset.examples, + all_predictions, ) if output_nbest_file: @@ -324,7 +332,7 @@ def get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() scores_diff_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -349,7 +357,7 @@ def get_predictions( null_start_logit = 0 # end logit at the slice with min null score null_end_logit = 0 - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] start_indexes = self._get_best_indexes(start_logits[pos], n_best_size) end_indexes = self._get_best_indexes(end_logits[pos], n_best_size) @@ -468,7 +476,7 @@ def get_predictions( probs = _compute_softmax(total_scores) nbest_json = [] - for (i, entry) in enumerate(nbest): + for i, entry in enumerate(nbest): output = collections.OrderedDict() output["question"] = example.question_text output["text"] = entry.text @@ -531,7 +539,7 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_best_indexes(self, logits, n_best_size): - """ Get the n-best logits from a list """ + """Get the n-best logits from a list""" best_indices = np.argsort(logits)[::-1] @@ -570,7 +578,7 @@ def _get_final_text(self, pred_text: str, orig_text: str, do_lower_case: bool, v def _strip_spaces(text): ns_chars = [] ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): + for i, c in enumerate(text): if c == " ": continue ns_to_s_map[len(ns_chars)] = i @@ -599,14 +607,16 @@ def _strip_spaces(text): if len(orig_ns_text) != len(tok_ns_text): if verbose_logging: logging.warning( - "Length not equal after stripping spaces: '%s' vs '%s'", orig_ns_text, tok_ns_text, + "Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, + tok_ns_text, ) return orig_text # We then project the characters in `pred_text` back to `orig_text` using # the character-to-character alignment. tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): + for i, tok_index in tok_ns_to_s_map.items(): tok_s_to_ns_map[tok_index] = i orig_start_position = None diff --git a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py index 405b9a1e05ade..059cf5625f150 100644 --- a/nemo/collections/nlp/models/question_answering/qa_gpt_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_gpt_model.py @@ -27,10 +27,14 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class GPTQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("GPTQAModel") + self.cfg = cfg self.setup_tokenizer(cfg.tokenizer) @@ -102,7 +106,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -185,10 +193,19 @@ def inference( return all_predictions, all_nbest_perdictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_predictions = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -226,7 +243,12 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} for index, unique_id in enumerate(unique_ids): @@ -242,7 +264,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -250,7 +272,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] diff --git a/nemo/collections/nlp/models/question_answering/qa_model.py b/nemo/collections/nlp/models/question_answering/qa_model.py index 6fb2054a22370..2147d7d6a5bfc 100644 --- a/nemo/collections/nlp/models/question_answering/qa_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_model.py @@ -32,6 +32,7 @@ from nemo.collections.nlp.parts.utils_funcs import tensor2list from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning __all__ = ['QAModel'] @@ -42,6 +43,9 @@ class QAModel(NLPModel): """ def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("QAModel") + super().__init__(cfg=cfg, trainer=trainer) self.classifier = TokenClassifier( hidden_size=self.hidden_size, @@ -186,7 +190,7 @@ def inference( num_samples: number of samples to use of inference data. Default: -1 if all data should be used. output_nbest_file: optional output file for writing out nbest list output_prediction_file: optional output file for writing out predictions - + Returns: model predictions, model nbest list """ diff --git a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py index 81001fb66da7b..5ad959fd1b6f6 100644 --- a/nemo/collections/nlp/models/question_answering/qa_s2s_model.py +++ b/nemo/collections/nlp/models/question_answering/qa_s2s_model.py @@ -28,10 +28,13 @@ from nemo.collections.nlp.models.question_answering.qa_base_model import BaseQAModel from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.utils import logging +from nemo.utils.decorators import deprecated_warning class S2SQAModel(BaseQAModel): def __init__(self, cfg: DictConfig, trainer: Trainer = None): + # deprecation warning + deprecated_warning("S2SQAModel") self.cfg = cfg @@ -120,7 +123,11 @@ def on_validation_epoch_end(self): eval_dataset = self._test_dl.dataset if self.trainer.testing else self._validation_dl.dataset eval_results, _, _ = self.evaluate( - eval_dataset.features, eval_dataset.examples, unique_ids, per_sample_perplexity, generated_answers, + eval_dataset.features, + eval_dataset.examples, + unique_ids, + per_sample_perplexity, + generated_answers, ) self.log(f'{prefix}_loss', avg_loss) @@ -145,7 +152,11 @@ def forward(self, input_ids, input_attn_mask, labels): labels = torch.where(labels != -100, labels, torch.zeros_like(labels)) output_attn_masks = torch.where(labels > 0, torch.ones_like(labels), torch.zeros_like(labels)) unmasked_unreduced_loss = self.language_model( - input_ids, labels[:, :-1], input_attn_mask, output_attn_masks[:, :-1], lm_labels=labels[:, 1:], + input_ids, + labels[:, :-1], + input_attn_mask, + output_attn_masks[:, :-1], + lm_labels=labels[:, 1:], ) loss = self.language_model.loss_func(output_attn_masks[:, 1:], unmasked_unreduced_loss) per_sample_perplexity = torch.exp(unmasked_unreduced_loss) @@ -210,10 +221,19 @@ def inference( return all_predictions, all_nbest_predictions def evaluate( - self, features, examples, unique_ids, per_sample_perplexity, generated_texts, + self, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ): all_predictions, all_nbest_json = self._get_predictions( - features, examples, unique_ids, per_sample_perplexity, generated_texts, + features, + examples, + unique_ids, + per_sample_perplexity, + generated_texts, ) eval_results = QAMetrics.evaluate_predictions(examples, all_predictions) @@ -251,7 +271,12 @@ def _setup_dataloader_from_config(self, cfg: DictConfig, mode: str): return data_loader def _get_predictions( - self, features, examples: List, unique_ids: List[int], per_sample_perplexity: List, generated_texts: List, + self, + features, + examples: List, + unique_ids: List[int], + per_sample_perplexity: List, + generated_texts: List, ): unique_id_to_pos = {} @@ -268,7 +293,7 @@ def _get_predictions( all_predictions = collections.OrderedDict() all_nbest_json = collections.OrderedDict() - for (example_index, example) in enumerate(examples): + for example_index, example in enumerate(examples): # finish this loop if we went through all batch examples if example_index >= len(unique_ids): @@ -276,7 +301,7 @@ def _get_predictions( curr_features = example_index_to_features[example_index] prelim_predictions = [] - for (feature_index, feature) in enumerate(curr_features): + for feature_index, feature in enumerate(curr_features): pos = unique_id_to_pos[feature.unique_id] curr_perplexity = per_sample_perplexity[pos] curr_generated_text = generated_texts[pos] @@ -339,7 +364,10 @@ def _generate_candidates(self, input_ids, input_attn_mask): "max_length": num_tokens_to_generate, } generated_tokens = self.language_model.generate(**param_dict) - generated_answers = self.tokenizer.tokenizer.batch_decode(generated_tokens, skip_special_tokens=True,) + generated_answers = self.tokenizer.tokenizer.batch_decode( + generated_tokens, + skip_special_tokens=True, + ) generated_answers = [ans.strip() for ans in generated_answers] elif self.cfg.library == 'megatron': diff --git a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py index eed94f2e1e315..d9e08f6764fc2 100644 --- a/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py +++ b/nemo/collections/nlp/models/spellchecking_asr_customization/spellchecking_model.py @@ -35,7 +35,7 @@ from nemo.core.classes.common import PretrainedModelInfo, typecheck from nemo.core.neural_types import LogitsType, NeuralType from nemo.utils import logging -from nemo.utils.decorators import experimental +from nemo.utils.decorators import deprecated_warning, experimental __all__ = ["SpellcheckingAsrCustomizationModel"] @@ -48,7 +48,7 @@ class SpellcheckingAsrCustomizationModel(NLPModel): It takes as input ASR hypothesis and candidate customization entries. It labels the hypothesis with correct entry index or 0. Example input: [CLS] a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o [SEP] d i d i e r _ s a u m o n [SEP] a s t r o n o m i e [SEP] t r i s t a n _ g u i l l o t [SEP] ... - Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 + Input segments: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 Example output: 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 3 3 3 3 3 3 3 3 3 3 3 3 3 0 ... """ @@ -67,6 +67,9 @@ def output_module(self): return self def __init__(self, cfg: DictConfig, trainer: Trainer = None) -> None: + # deprecation warning + deprecated_warning("SpellcheckingAsrCustomizationModel") + super().__init__(cfg=cfg, trainer=trainer) # Label map contains 11 labels: 0 for nothing, 1..10 for target candidate ids @@ -321,7 +324,7 @@ def on_test_epoch_end(self): @torch.no_grad() def infer(self, dataloader_cfg: DictConfig, input_name: str, output_name: str) -> None: - """ Main function for Inference + """Main function for Inference Args: dataloader_cfg: config for dataloader @@ -517,7 +520,7 @@ def _setup_infer_dataloader(self, cfg: DictConfig, input_name: str) -> 'torch.ut Setup function for a infer data loader. Args: cfg: config dictionary containing data loader params like batch_size, num_workers and pin_memory - input_name: path to input file. + input_name: path to input file. Returns: A pytorch DataLoader. """ diff --git a/nemo/utils/decorators/__init__.py b/nemo/utils/decorators/__init__.py index 4468a3bc09b5e..2cfec9e40d648 100644 --- a/nemo/utils/decorators/__init__.py +++ b/nemo/utils/decorators/__init__.py @@ -13,6 +13,6 @@ # limitations under the License. -from nemo.utils.decorators.deprecated import deprecated +from nemo.utils.decorators.deprecated import deprecated, deprecated_warning from nemo.utils.decorators.experimental import experimental from nemo.utils.decorators.port_docs import add_port_docs diff --git a/nemo/utils/decorators/deprecated.py b/nemo/utils/decorators/deprecated.py index 65f92e62563e0..40957bb343d42 100644 --- a/nemo/utils/decorators/deprecated.py +++ b/nemo/utils/decorators/deprecated.py @@ -30,14 +30,14 @@ def deprecated(wrapped=None, version=None, explanation=None, wait_seconds=0): """ - Decorator which can be used for indicating that a function/class is deprecated and going to be removed. - Tracks down which function/class printed the warning and will print it only once per call. - - Args: - version: Version in which the function/class will be removed (optional). - explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional). - wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned - with subsequent logging messages. + Decorator which can be used for indicating that a function/class is deprecated and going to be removed. + Tracks down which function/class printed the warning and will print it only once per call. + + Args: + version: Version in which the function/class will be removed (optional). + explanation: Additional explanation, e.g. "Please, ``use another_function`` instead." (optional). + wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned + with subsequent logging messages. """ if wrapped is None: @@ -71,3 +71,26 @@ def wrapper(wrapped, instance, args, kwargs): return wrapped(*args, **kwargs) return wrapper(wrapped) + + +def deprecated_warning(old_method=None, new_method=None, wait_seconds=2): + """ + Function which can be used for indicating that a function/class is deprecated and going to be removed. + + Args: + old_method: Name of deprecated class/function. + new_method: Name of new class/function to use. + wait_seconds: Sleep for a few seconds after the deprecation message appears in case it gets drowned + with subsequent logging messages. + """ + + # Create a banner + if new_method is not None: + msg = f"***** {old_method} is deprecated. Please, use {new_method} instead. *****" + else: + msg = f"***** {old_method} is deprecated and will be removed soon. *****" + banner = '\n'.join(['*' * len(msg)] * 2 + [msg] + ['*' * len(msg)] * 2) + + logging.warning(f"\n\n{banner}\n") + logging.warning(f"Waiting for {wait_seconds} seconds before this message disappears.") + time.sleep(wait_seconds) diff --git a/tests/collections/nlp/test_dialogue.py b/tests/collections/nlp/test_dialogue.py deleted file mode 100644 index 9c227f737d988..0000000000000 --- a/tests/collections/nlp/test_dialogue.py +++ /dev/null @@ -1,278 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -import torch - -from nemo.collections.nlp.data.dialogue.data_processor.assistant_data_processor import DialogueAssistantDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.data_processor import DialogueDataProcessor -from nemo.collections.nlp.data.dialogue.data_processor.sgd_data_processor import DialogueSGDDataProcessor -from nemo.collections.nlp.data.dialogue.dataset.dialogue_gpt_classification_dataset import ( - DialogueGPTClassificationDataset, -) -from nemo.collections.nlp.data.dialogue.dataset.dialogue_s2s_generation_dataset import DialogueS2SGenerationDataset -from nemo.collections.nlp.data.dialogue.dataset.dialogue_sgd_bert_dataset import DialogueSGDBERTDataset -from nemo.collections.nlp.metrics.dialogue_metrics import DialogueClassificationMetrics, DialogueGenerationMetrics -from nemo.collections.nlp.models.dialogue.dialogue_nearest_neighbour_model import DialogueNearestNeighbourModel - - -@pytest.mark.unit -def test_dialogue_metric_generation_f1(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - precision, recall, f1 = DialogueGenerationMetrics._get_one_f1(generated_field, ground_truth_field) - assert precision == 75 - assert recall == 75 - assert f1 == 75 - - -@pytest.mark.unit -def test_dialogue_metric_split_label_and_slots(): - fields = ["reserve_restaurant\nslots: time_of_day(7pm), number_of_people(3)", "time_of_day(7pm)"] - labels, slots_list = DialogueClassificationMetrics.split_label_and_slots(fields, with_slots=True) - assert labels == ["reserve_restaurant", 'none'] - assert slots_list == [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - - -@pytest.mark.unit -def test_dialogue_metric_slot_filling_metrics(): - generated_slots = [["time_of_day(7pm)", "number_of_people(3)"], ["time_of_day(7pm)"]] - ground_truth_slots = [["time_of_day(7pm)"], ["time_of_day(7pm)", "number_of_people(3)"]] - - ( - avg_precision, - avg_recall, - avg_f1, - avg_joint_goal_accuracy, - ) = DialogueClassificationMetrics.get_slot_filling_metrics(generated_slots, ground_truth_slots) - - assert avg_precision == 75 - assert avg_recall == 75 - assert avg_f1 == 75 - assert avg_joint_goal_accuracy == 0 - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_normalize_zero_shot_intent(): - label0 = 'food_ordering.contextual_query' - normalized_label0 = 'contextual query' - - label1 = 'food_ordering.nomatch' - normalized_label1 = 'no match' - - label2 = 'food_ordering.no' - normalized_label2 = 'no' - - assert normalized_label0 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label0) - assert normalized_label1 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label1) - assert normalized_label2 == DialogueAssistantDataProcessor.normalize_zero_shot_intent(label2) - - -@pytest.mark.unit -def test_dialogue_assistant_data_processor_get_continuous_slots(): - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {19: [3, 5], 18: [5, 6]} - - # here 18 and 19 maps to the same slot (originally variants of B-slot and I-slot) - slot_ids = [54, 54, 54, 19, 19, 18, 54, 54, 54] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 18, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {18: [3, 6]} - - # test if function works when non-empty slots are at boundary - slot_ids = [18, 54, 54, 19, 19] - empty_slot_id = 54 - bio_slot_ids_to_unified_slot_ids = {18: 18, 19: 19, 54: 54} - continuous_slots = DialogueAssistantDataProcessor.get_continuous_slots( - slot_ids, empty_slot_id, bio_slot_ids_to_unified_slot_ids - ) - assert continuous_slots == {18: [0, 1], 19: [3, 5]} - - -@pytest.mark.unit -def test_dialogue_assistant_map_bio_format_slots_to_unified_slots(): - - slots = ['B-time', 'I-time', 'B-alarm', 'I-alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '0', '2': '1', '3': '1', '4': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - # case in which BIOS scheme was not used in annotation - slots = ['time', 'alarm', 'O'] - gt_bio_slot_ids_to_unified_slot_ids = {'0': '0', '1': '1', '2': '2'} - gt_unified_slots = ['time', 'alarm', 'O'] - ( - bio_slot_ids_to_unified_slot_ids, - unified_slots, - ) = DialogueAssistantDataProcessor.map_bio_format_slots_to_unified_slots(slots) - - assert gt_bio_slot_ids_to_unified_slot_ids == bio_slot_ids_to_unified_slot_ids - assert gt_unified_slots == unified_slots - - -@pytest.mark.unit -def test_dialogue_data_processor_get_relevant_idxs(): - - dataset_split = 'train' - dev_proportion = 10 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 900 - assert idxs != list(range(900)) - - dataset_split = 'dev' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 400 - assert idxs != list(range(400)) - - dataset_split = 'test' - dev_proportion = 40 - n_samples = 1000 - idxs = DialogueDataProcessor.get_relevant_idxs(dataset_split, n_samples, dev_proportion) - - assert len(idxs) == 1000 - assert idxs == list(range(1000)) - - -@pytest.mark.unit -def test_dialogue_sgd_data_processor_convert_camelcase_to_lower(): - label = 'none' - gt_converted_label = 'none' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'ReserveRestaurant' - gt_converted_label = 'reserve restaurant' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - label = 'Alarm' - gt_converted_label = 'alarm' - - assert gt_converted_label == DialogueSGDDataProcessor.convert_camelcase_to_lower(label) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - slots = [] - linearized_slots = 'None' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': '7pm', 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - slots = {'time': ['7pm', '1900'], 'place': 'field'} - linearized_slots = 'time(7pm), place(field)' - assert linearized_slots == DialogueGPTClassificationDataset.linearize_slots(slots) - - -@pytest.mark.unit -def test_dialogue_gpt_classification_dataset_linearize_slots(): - - actions = [ - {'act': 'inform', 'slot': 'time', 'values': ['7pm', '1900']}, - {'act': 'confirm', 'slot': 'place', 'values': ['hall']}, - ] - - prompt_template = 'values' - formatted_actions = '7pm hall' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'slots_values' - formatted_actions = 'time (7pm) place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - prompt_template = 'acts_slots_values' - formatted_actions = 'inform time (7pm) confirm place (hall)' - assert formatted_actions == DialogueS2SGenerationDataset.format_actions(prompt_template, actions) - - -@pytest.mark.unit -def test_dialogue_sgd_dataset_naive_tokenize(): - - utterance = 'I am feeling hungry so I would like to find a place to eat.' - tokens = [ - 'I', - ' ', - 'am', - ' ', - 'feeling', - ' ', - 'hungry', - ' ', - 'so', - ' ', - 'I', - ' ', - 'would', - ' ', - 'like', - ' ', - 'to', - ' ', - 'find', - ' ', - 'a', - ' ', - 'place', - ' ', - 'to', - ' ', - 'eat', - '.', - ] - assert tokens == DialogueSGDBERTDataset._naive_tokenize(utterance) - - -@pytest.mark.unit -def test_dialogue_nearest_neighbour_mean_pooling(): - - model_output = [torch.ones(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.zeros(8, 512, 768)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.zeros(8, 768).float(), DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) - - model_output = [torch.cat([torch.zeros(8, 256, 768), torch.ones(8, 256, 768)], axis=1)] - attention_mask = torch.ones(8, 512) - assert torch.equal( - torch.ones(8, 768).float() * 0.5, DialogueNearestNeighbourModel.mean_pooling(model_output, attention_mask) - ) diff --git a/tests/collections/nlp/test_entity_linking_model.py b/tests/collections/nlp/test_entity_linking_model.py deleted file mode 100644 index 16b7681842964..0000000000000 --- a/tests/collections/nlp/test_entity_linking_model.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import shutil -import tempfile - -import pytest -import wget -from omegaconf import OmegaConf - -from nemo.collections.nlp.models import EntityLinkingModel - - -def get_cfg(): - - language_model = OmegaConf.create( - {"pretrained_model_name": "bert-base-uncased", "config_file": None, "config": None, "lm_checkpoint": None} - ) - - tokenizer = OmegaConf.create( - {"tokenizer_name": "bert-base-uncased", "vocab_file": None, "tokenizer_model": None, "do_lower_case": True} - ) - - model = OmegaConf.create( - { - "nemo_path": "sap_entity_linking.nemo", - "max_seq_length": 128, - "language_model": language_model, - "tokenizer": tokenizer, - "train_ds": None, - "validation_ds": None, - } - ) - - cfg = OmegaConf.create({"model": model}) - - return cfg - - -class TestEntityLinkingModel: - @pytest.mark.with_downloads() - @pytest.mark.unit - def test_creation_saving_restoring(self): - # Create a new temporary directory - with tempfile.TemporaryDirectory() as restore_dir: - with tempfile.TemporaryDirectory() as save_dir: - model = EntityLinkingModel(cfg=get_cfg().model) - assert isinstance(model, EntityLinkingModel) - - save_dir_path = save_dir - - # Where model will be saved - model_save_path = os.path.join(save_dir, f"{model.__class__.__name__}.nemo") - model.save_to(save_path=model_save_path) - - # Where model will be restored from - model_restore_path = os.path.join(restore_dir, f"{model.__class__.__name__}.nemo") - shutil.copy(model_save_path, model_restore_path) - - # at this point save_dir should not exist - assert save_dir_path is not None and not os.path.exists(save_dir_path) - assert not os.path.exists(model_save_path) - assert os.path.exists(model_restore_path) - - # attempt to restore - model_copy = model.__class__.restore_from(restore_path=model_restore_path) - assert model.num_weights == model_copy.num_weights - - -if __name__ == "__main__": - t = TestEntityLinkingModel() - t.test_creation_saving_restoring() diff --git a/tests/collections/nlp/test_megatron.py b/tests/collections/nlp/test_megatron.py deleted file mode 100644 index 8206457ec6eee..0000000000000 --- a/tests/collections/nlp/test_megatron.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - import apex - - apex_available = True -except Exception: - apex_available = False - -import os -import tempfile - -import onnx -import pytest -import torch -from omegaconf import OmegaConf - -import nemo.collections.nlp as nemo_nlp -from nemo.core.classes import typecheck - - -def get_pretrained_bert_345m_uncased_model(): - model_name = "megatron-bert-345m-uncased" - config = {"language_model": {"pretrained_model_name": model_name}, "tokenizer": {}} - omega_conf = OmegaConf.create(config) - model = nemo_nlp.modules.get_lm_model(cfg=omega_conf) - if torch.cuda.is_available(): - model = model.cuda() - return model - - -class TestMegatron: - @pytest.mark.skip("This test was written for megatron-lm") - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_list_pretrained_models(self): - pretrained_lm_models = nemo_nlp.modules.get_pretrained_lm_models_list() - assert len(pretrained_lm_models) > 0 - - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Only one Megatron model is allowed") - def test_get_model(self): - model = get_pretrained_bert_345m_uncased_model() - assert isinstance(model, nemo_nlp.modules.MegatronBertEncoder) - - typecheck.set_typecheck_enabled(enabled=False) - inp = model.input_example() - out = model.forward(*inp) - typecheck.set_typecheck_enabled(enabled=True) - - @pytest.mark.skipif(not os.path.exists('/home/TestData'), reason='Not a Jenkins machine') - @pytest.mark.with_downloads() - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - @pytest.mark.skip("Megatron-LM BERT support deprecated. Supported in NeMo < 1.5") - def test_onnx_export(self): - model = get_pretrained_bert_345m_uncased_model() - assert model - with tempfile.TemporaryDirectory() as tmpdir: - # Generate filename in the temporary directory. - # Test export. - model.export(os.path.join(".", "megatron.onnx")) - - -if __name__ == "__main__": - t = TestMegatron() - t.test_onnx_export() diff --git a/tests/collections/nlp/test_mem_map_dataset.py b/tests/collections/nlp/test_mem_map_dataset.py deleted file mode 100644 index 20932b6c4e0df..0000000000000 --- a/tests/collections/nlp/test_mem_map_dataset.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import csv -import json -import os - -import pytest - -from nemo.collections.nlp.data.language_modeling import text_memmap_dataset - - -@pytest.fixture -def jsonl_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.jsonl" - - # Generate data to write to the JSONL file - data = [ - {"name": "John", "age": 30}, - {"name": "Jane", "age": 25}, - {"name": "Bob", "age": 35}, - ] - - # Write data to the JSONL file - with open(file_path, mode="w") as file: - for item in data: - json.dump(item, file) - file.write("\n") - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -@pytest.fixture -def csv_file(tmp_path): - # Create a temporary file path - file_path = tmp_path / "data.csv" - - # Generate data to write to the CSV file - data = [["ID", "Name"], [1, "John"], [2, "Jane"], [3, "Bob"]] - - # Write data to the CSV file - with open(file_path, mode="w", newline="") as file: - writer = csv.writer(file) - writer.writerows(data) - - # Provide the file path to the test function - yield str(file_path) - - # Optional: Clean up the temporary file after the test - file_path.unlink() - - -def test_jsonl_mem_map_dataset(jsonl_file): - """Test for JSONL memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert indexed_dataset[0] == {"name": "John", "age": 30} - assert indexed_dataset[1] == {"name": "Jane", "age": 25} - assert indexed_dataset[2] == {"name": "Bob", "age": 35} - - -def test_csv_mem_map_dataset(csv_file): - """Test for CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVMemMapDataset(dataset_paths=[csv_file], data_col=1, header_lines=1) - assert indexed_dataset[0].strip() == "John" - assert indexed_dataset[1].strip() == "Jane" - assert indexed_dataset[2].strip() == "Bob" - - -def test_csv_fields_mem_map_dataset(csv_file): - """Test for CSV memory-mapped datasets.""" - - indexed_dataset = text_memmap_dataset.CSVFieldsMemmapDataset( - dataset_paths=[csv_file], data_fields={"ID": 0, "Name": 1}, header_lines=1 - ) - assert isinstance(indexed_dataset[0], dict) - assert sorted(indexed_dataset[0].keys()) == ["ID", "Name"] - assert indexed_dataset[0]["ID"] == "1" and indexed_dataset[1]["ID"] == "2" and indexed_dataset[2]["ID"] == "3" - assert ( - indexed_dataset[0]["Name"].strip() == "John" - and indexed_dataset[1]["Name"].strip() == "Jane" - and indexed_dataset[2]["Name"].strip() == "Bob" - ) - - -@pytest.mark.parametrize( - "dataset_class", [text_memmap_dataset.JSONLMemMapDataset, text_memmap_dataset.CSVMemMapDataset], -) -@pytest.mark.parametrize("use_alternative_index_mapping_dir", [True, False]) -@pytest.mark.parametrize("relative_index_fn", [True, False]) -def test_mem_map_dataset_index_mapping_dir( - tmp_path, dataset_class, jsonl_file, use_alternative_index_mapping_dir, relative_index_fn, -): - """Test for index_mapping_dir.""" - if relative_index_fn: - jsonl_file = os.path.relpath(jsonl_file) - else: - jsonl_file = os.path.abspath(jsonl_file) - - if use_alternative_index_mapping_dir: - index_mapping_dir = tmp_path / "subdir" - dataset_class(dataset_paths=[jsonl_file], header_lines=0, index_mapping_dir=str(index_mapping_dir)) - # Index files should not be created in default location. - assert not os.path.isfile(f"{jsonl_file}.idx.npy") - assert not os.path.isfile(f"{jsonl_file}.idx.info") - if relative_index_fn: - # Remove leading ".." sequences. - while jsonl_file.startswith(("../")): - jsonl_file = jsonl_file.lstrip("../") - idx_fn = f"{str(index_mapping_dir)}/{jsonl_file}.idx" - assert os.path.isfile(f"{idx_fn}.npy") - assert os.path.isfile(f"{idx_fn}.info") - else: - text_memmap_dataset.JSONLMemMapDataset(dataset_paths=[jsonl_file], header_lines=0) - assert os.path.isfile(f"{jsonl_file}.idx.npy") - assert os.path.isfile(f"{jsonl_file}.idx.info") diff --git a/tests/collections/nlp/test_prompt_learning.py b/tests/collections/nlp/test_prompt_learning.py deleted file mode 100644 index 4597fe9ecef08..0000000000000 --- a/tests/collections/nlp/test_prompt_learning.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os - -import pytest -import torch - -from nemo.collections.nlp.data.language_modeling.megatron.gpt_prompt_learning_dataset import GPTPromptLearningDataset -from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import get_pseudo_tokens -from nemo.collections.nlp.modules.common import VirtualPromptSource -from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer -from nemo.core import Dataset - - -def get_prompt_tuning_dataset( - dataset_path, tokenizer, virtual_prompt_source, task_templates, pseudo_tokens, -): - dataset = GPTPromptLearningDataset( - data=[dataset_path], - tokenizer=tokenizer, - virtual_prompt_source=virtual_prompt_source, - task_templates=task_templates, - pseudo_tokens=pseudo_tokens, - pad_token_id=tokenizer.unk_id, - max_seq_length=512, - min_seq_length=1, - ) - - return dataset - - -def create_temp_dataset(): - example_dataset_a = [ - {'taskname': 'task name A', 'text': 'Test sentence one, Answer: ', 'answer': 'test'} for i in range(24) - ] - example_dataset_b = [ - {'taskname': 'task name B', 'question': 'This is a question', 'answer': 'test'} for i in range(13) - ] - example_dataset = example_dataset_a + example_dataset_b - temp_file_name = 'temp_dataset_file.jsonl' - - with open(temp_file_name, 'w') as temp: - for example in example_dataset: - temp.write(json.dumps(example) + '\n') - - return temp_file_name - - -def get_task_templates(): - task_templates = {} - task_templates['task name A'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{text}{answer}", - "prompt_template_fields": ['text', 'answer'], - "total_virtual_tokens": 5, - "virtual_token_splits": [5], - "truncate_field": None, - "answer_only_loss": True, - "answer_field": "answer", - "task_id_num": 0, - } - task_templates['task name B'] = { - "prompt_template": "<|VIRTUAL_PROMPT_0|>{question}<|VIRTUAL_PROMPT_1|>{answer}{extra}", - "prompt_template_fields": ['question', 'answer', 'extra'], - "total_virtual_tokens": 10, - "virtual_token_splits": [7, 3], - "truncate_field": None, - "answer_only_loss": False, - "answer_field": None, - "task_id_num": 1, - } - return task_templates - - -class TestMegatronGPTPromptLearningDataset: - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_init_prompt_learning_dataset(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - print(type(dataset)) - - assert isinstance(dataset, Dataset) - - os.remove(dataset_path) - - @pytest.mark.run_only_on('GPU') - @pytest.mark.unit - def test_prompt_learning_dataset_collate_fn_prompt_encoder(self): - tokenizer = get_nmt_tokenizer(library='megatron', model_name='GPT2BPETokenizer') - task_templates = get_task_templates() - dataset_path = create_temp_dataset() - - # Setup virtual token place holders - total_virtual_tokens = 10 - pseudo_tokens = get_pseudo_tokens(total_virtual_tokens) - tokenizer.add_special_tokens({'additional_special_tokens': pseudo_tokens}) - - dataset = get_prompt_tuning_dataset( - dataset_path, tokenizer, VirtualPromptSource.PROMPT_ENCODER, task_templates, pseudo_tokens, - ) - - batch = [dataset[i] for i in range(8)] - batch = dataset.collate_fn(batch) - - assert len(batch) == 6 - - _, _, _, _, _, taskname_ids = batch - - assert list(taskname_ids[0].numpy()) == tokenizer.text_to_ids("task name A") - - os.remove(dataset_path) - - -if __name__ == "__main__": - t = TestMegatronGPTPromptLearningDataset() - t.test_init_prompt_learning_dataset() - t.test_prompt_learning_dataset_collate_fn_prompt_encoder() - print('-' * 50 + '\nALL PROMPT TUNING UNIT TESTS PASS!\n' + '-' * 50) diff --git a/tests/collections/nlp/test_qna.py b/tests/collections/nlp/test_qna.py deleted file mode 100644 index 4a470cacb7113..0000000000000 --- a/tests/collections/nlp/test_qna.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections - -import pytest -import torch - -from nemo.collections.nlp.data.question_answering.dataset.qa_dataset import QADataset -from nemo.collections.nlp.data.question_answering.dataset.qa_gpt_dataset import GPTQADataset -from nemo.collections.nlp.metrics.qa_metrics import QAMetrics - - -@pytest.mark.unit -def test_remove_articles(): - sentences = [ - "this is an apple", - "this is the apple", - "this is a fruit", - ] - - expected_article_removed_sents = ["this is apple", "this is apple", "this is fruit"] - - article_removed_sents = [QAMetrics.remove_articles(sent) for sent in sentences] - - assert article_removed_sents == expected_article_removed_sents - - -@pytest.mark.unit -def test_white_space_fix(): - sentences = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - expected_white_space_fixed_sents = [ - "sentence with a space", - "sentence with multiple spaces", - ] - - white_space_fixed_sents = [QAMetrics.white_space_fix(sent) for sent in sentences] - - assert white_space_fixed_sents == expected_white_space_fixed_sents - - -@pytest.mark.unit -def test_remove_punc(): - sentence = "this, is. a! sentence: with; punctuations?" - expected_punc_removed_sent = "this is a sentence with punctuations" - - punc_removed_sent = QAMetrics.remove_punc(sentence) - - assert punc_removed_sent == expected_punc_removed_sent - - -@pytest.mark.unit -def test_get_normalized_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == QAMetrics._get_normalized_tokens(sentence) - - -@pytest.mark.unit -def test_get_one_f1(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = QAMetrics.get_one_f1(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_get_one_exact_match(): - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = QAMetrics.get_one_exact_match(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = QADataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = QADataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == QADataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == QADataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == QADataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - -@pytest.mark.unit -def test_gpt_no_pad_loss_masking(): - input_ids = [1] * 15 + [50257] * 15 - input_ids = torch.tensor(input_ids) - - input_attn_mask = [1] * 16 + [0] * 14 - input_attn_mask = torch.Tensor(input_attn_mask) - - training_mask_end = 10 - - expected_labels = [-100] * 10 + [1] * 5 + [50257] + [-100] * 14 - expected_labels = torch.tensor(expected_labels) - - labels = GPTQADataset.update_labels_for_no_pad_loss(input_ids, training_mask_end, input_attn_mask) - - assert torch.all(labels.eq(expected_labels)) diff --git a/tests/collections/nlp/test_question_answering.py b/tests/collections/nlp/test_question_answering.py deleted file mode 100644 index c4aacf449c501..0000000000000 --- a/tests/collections/nlp/test_question_answering.py +++ /dev/null @@ -1,185 +0,0 @@ -# Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import collections -from pydoc import doc - -import pytest - -from nemo.collections.nlp.data.question_answering_squad.qa_dataset import SquadDataset -from nemo.collections.nlp.data.question_answering_squad.qa_squad_processing import ( - _get_tokens, - exact_match_score, - f1_score, -) - - -@pytest.mark.unit -def test_get_tokens(): - sentence = 'I am happy' - tokens = ['i', 'am', 'happy'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - sentence = 'I am a person.' - tokens = ['i', 'am', 'person'] - assert tokens == _get_tokens(sentence) - - -@pytest.mark.unit -def test_f1_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0.75 - - generated_field = '' - ground_truth_field = 'That' - - f1 = f1_score(generated_field, ground_truth_field) - assert f1 == 0 - - -@pytest.mark.unit -def test_exact_match_score(): - - generated_field = 'That is so good' - ground_truth_field = 'That is so awesome' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 0 - - generated_field = 'That is so good!' - ground_truth_field = 'That is so good.' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - generated_field = 'That is so good' - ground_truth_field = 'that is so good' - - em = exact_match_score(generated_field, ground_truth_field) - assert em == 1 - - -@pytest.mark.unit -def test_split_into_words(): - text = 'hi yo' - char_to_word_offset = [0, 0, 0, 1, 1] - doc_tokens = ["hi", "yo"] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - text = 'i am good' - char_to_word_offset = [0, 0, 1, 1, 1, 2, 2, 2, 2] - doc_tokens = ["i", "am", 'good'] - output = SquadDataset.split_into_words(text) - assert output[0] == doc_tokens - assert output[1] == char_to_word_offset - - -@pytest.mark.unit -def test_get_doc_spans(): - all_doc_tokens = ['a'] * 15 - max_tokens_for_doc = 10 - doc_stride = 5 - doc_spans = SquadDataset.get_docspans(all_doc_tokens, max_tokens_for_doc, doc_stride) - - assert len(doc_spans) == 2 - assert doc_spans[0].start == 0 - assert doc_spans[0].length == 10 - assert doc_spans[1].start == 5 - assert doc_spans[1].length == 10 - - -@pytest.mark.unit -def test_get_average_dist_to_tok_start_and_end(): - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_span = _DocSpan(start=0, length=5) - - tok_start_position = 1 - tok_end_position = 3 - - assert 2 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=5) - - tok_start_position = 1 - tok_end_position = 2 - - assert 6 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - doc_span = _DocSpan(start=5, length=4) - - tok_start_position = 1 - tok_end_position = 2 - - assert 5 == SquadDataset.get_average_dist_to_tok_start_and_end(doc_span, tok_start_position, tok_end_position) - - -@pytest.mark.unit -def test_keep_relevant_docspans(): - - _DocSpan = collections.namedtuple("DocSpan", ["start", "length"]) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'all' - assert doc_spans == SquadDataset.keep_relevant_docspans(doc_spans, tok_start_position, tok_end_position, mode) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = -1 - tok_end_position = -1 - - mode = 'only_positive' - - expected_doc_spans = [] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'only_positive' - - expected_doc_spans = [_DocSpan(start=0, length=5), _DocSpan(start=1, length=5)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) - - doc_spans = [_DocSpan(start=start, length=5) for start in range(15)] - - tok_start_position = 1 - tok_end_position = 2 - - mode = 'limited_negative' - - expected_doc_spans = [_DocSpan(start=start, length=5) for start in range(10)] - assert expected_doc_spans == SquadDataset.keep_relevant_docspans( - doc_spans, tok_start_position, tok_end_position, mode - ) diff --git a/tests/collections/nlp/test_spellchecking_asr_customization.py b/tests/collections/nlp/test_spellchecking_asr_customization.py deleted file mode 100644 index 8e4d6e9a7b8f6..0000000000000 --- a/tests/collections/nlp/test_spellchecking_asr_customization.py +++ /dev/null @@ -1,1102 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import pytest -from transformers import AutoTokenizer - -from nemo.collections.nlp.data.spellchecking_asr_customization.bert_example import BertExampleBuilder -from nemo.collections.nlp.data.spellchecking_asr_customization.utils import ( - apply_replacements_to_text, - substitute_replacements_in_text, -) - - -@pytest.mark.unit -def test_substitute_replacements_in_text(): - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement" - replacements = [(66, 75, 'pro-terra', 0.99986), (101, 109, 'navistar', 0.996)] - gold_text = "we began the further diversification of our revenue base with the pro-terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=False) - assert corrected_text == gold_text - - gold_text_no_hyphen = "we began the further diversification of our revenue base with the pro terra supply agreement and the navistar joint development agreement" - corrected_text = substitute_replacements_in_text(text, replacements, replace_hyphen_to_space=True) - assert corrected_text == gold_text_no_hyphen - - -@pytest.mark.unit -def test_apply_replacements_to_text(): - - # min_prob = 0.5 - # dp_data = None, - # min_dp_score_per_symbol: float = -99.9 - - # test more than one fragment to replace, test multiple same replacements - text = "we began the further diversification of our revenue base with the protterra supply agreement and the navastar joint development agreement" - replacements = [ - (66, 75, 'proterra', 0.99986), - (66, 75, 'proterra', 0.9956), - (101, 109, 'navistar', 0.93), - (101, 109, 'navistar', 0.91), - (101, 109, 'navistar', 0.92), - ] - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navistar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.5, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - # test that min_prob works - gold_text = "we began the further diversification of our revenue base with the proterra supply agreement and the navastar joint development agreement" - corrected_text = apply_replacements_to_text( - text, replacements, min_prob=0.95, replace_hyphen_to_space=False, dp_data=None - ) - assert corrected_text == gold_text - - -@pytest.fixture() -def bert_example_builder(): - tokenizer = AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_6L_768D") - label_map = {"0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5, "6": 6, "7": 7, "8": 8, "9": 9, "10": 10} - semiotic_classes = {"PLAIN": 0, "CUSTOM": 1} - max_seq_len = 256 - builder = BertExampleBuilder(label_map, semiotic_classes, tokenizer, max_seq_len) - return builder - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_creation(bert_example_builder): - assert bert_example_builder._tokenizer is not None - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_spans(bert_example_builder): - span_info_parts = ["CUSTOM 37 41", "CUSTOM 47 52", "CUSTOM 42 46", "CUSTOM 0 7"] - gold_sorted_spans = [(1, 1, 8), (1, 38, 42), (1, 43, 47), (1, 48, 53)] - spans = bert_example_builder._get_spans(span_info_parts) - spans.sort() - assert spans == gold_sorted_spans - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_fragment_indices(bert_example_builder): - hyp = "a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w" - targets = [1] - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 8 17"] - gold_sorted_fragment_indices = [(7, 18, 1), (11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - # a b o u t _ o u r _ s h i p e r s _ b u t _ y o u _ k n o w - # 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 - span_info_parts = ["CUSTOM 10 16"] - gold_sorted_fragment_indices = [(11, 18, 1)] - fragment_indices = bert_example_builder._get_fragment_indices(hyp, targets, span_info_parts) - fragment_indices.sort() - assert fragment_indices == gold_sorted_fragment_indices - - -@pytest.mark.skip("Doesn't work download when testing on github, for unknown reason") -@pytest.mark.with_downloads -@pytest.mark.unit -def test_builder_get_input_features(bert_example_builder): - hyp = "a s t r o n o m e r s _ d i d i e _ s o m o n _ a n d _ t r i s t i a n _ g l l o" - ref = "d i d i e r _ s a u m o n;a s t r o n o m i e;t r i s t a n _ g u i l l o t;t r i s t e s s e;m o n a d e;c h r i s t i a n;a s t r o n o m e r;s o l o m o n;d i d i d i d i d i;m e r c y" - targets = [1, 3] - span_info_parts = ["CUSTOM 12 23", "CUSTOM 28 41"] - - gold_tags = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - ] - gold_input_ids = [ - 101, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1041, - 1054, - 1055, - 1035, - 1040, - 1045, - 1040, - 1045, - 1041, - 1035, - 1055, - 1051, - 1049, - 1051, - 1050, - 1035, - 1037, - 1050, - 1040, - 1035, - 1056, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 1035, - 1043, - 1048, - 1048, - 1051, - 102, - 1040, - 1045, - 1040, - 1045, - 1041, - 1054, - 1035, - 1055, - 1037, - 1057, - 1049, - 1051, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1045, - 1041, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1037, - 1050, - 1035, - 1043, - 1057, - 1045, - 1048, - 1048, - 1051, - 1056, - 102, - 1056, - 1054, - 1045, - 1055, - 1056, - 1041, - 1055, - 1055, - 1041, - 102, - 1049, - 1051, - 1050, - 1037, - 1040, - 1041, - 102, - 1039, - 1044, - 1054, - 1045, - 1055, - 1056, - 1045, - 1037, - 1050, - 102, - 1037, - 1055, - 1056, - 1054, - 1051, - 1050, - 1051, - 1049, - 1041, - 1054, - 102, - 1055, - 1051, - 1048, - 1051, - 1049, - 1051, - 1050, - 102, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 1040, - 1045, - 102, - 1049, - 1041, - 1054, - 1039, - 1061, - 102, - ] - gold_input_mask = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - ] - gold_segment_ids = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 7, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - 10, - 10, - 10, - 10, - ] - gold_labels_mask = [ - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - ] - gold_input_ids_for_subwords = [ - 101, - 26357, - 2106, - 2666, - 2061, - 8202, - 1998, - 13012, - 16643, - 2319, - 1043, - 7174, - 102, - 2106, - 3771, - 7842, - 2819, - 2239, - 102, - 28625, - 3630, - 9856, - 102, - 9822, - 26458, - 7174, - 2102, - 102, - 13012, - 13473, - 11393, - 102, - 13813, - 3207, - 102, - 3017, - 102, - 15211, - 102, - 9168, - 102, - 2106, - 28173, - 4305, - 4305, - 102, - 8673, - 102, - ] - gold_input_mask_for_subwords = [ - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - ] - gold_segment_ids_for_subwords = [ - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 2, - 3, - 3, - 3, - 3, - 3, - 4, - 4, - 4, - 4, - 5, - 5, - 5, - 6, - 6, - 7, - 7, - 8, - 8, - 9, - 9, - 9, - 9, - 9, - 10, - 10, - ] - gold_character_pos_to_subword_pos = [ - 0, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 2, - 2, - 2, - 3, - 3, - 3, - 4, - 4, - 5, - 5, - 5, - 5, - 6, - 6, - 6, - 6, - 7, - 7, - 7, - 8, - 8, - 8, - 9, - 9, - 9, - 10, - 11, - 11, - 11, - 12, - 13, - 13, - 13, - 14, - 14, - 14, - 14, - 15, - 15, - 16, - 16, - 17, - 17, - 18, - 19, - 19, - 19, - 19, - 19, - 20, - 20, - 21, - 21, - 21, - 22, - 23, - 23, - 23, - 23, - 23, - 23, - 23, - 23, - 24, - 24, - 24, - 25, - 25, - 25, - 26, - 27, - 28, - 28, - 28, - 29, - 29, - 29, - 30, - 30, - 30, - 31, - 32, - 32, - 32, - 32, - 33, - 33, - 34, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 35, - 36, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 37, - 38, - 39, - 39, - 39, - 39, - 39, - 39, - 39, - 40, - 41, - 41, - 41, - 42, - 42, - 42, - 43, - 43, - 44, - 44, - 45, - 46, - 46, - 46, - 46, - 46, - 47, - ] - - tags = [0 for _ in hyp.split()] - for p, t in zip(span_info_parts, targets): - c, start, end = p.split(" ") - start = int(start) - end = int(end) - tags[start:end] = [t for i in range(end - start)] - - # get input features for characters - (input_ids, input_mask, segment_ids, labels_mask, labels, _, _,) = bert_example_builder._get_input_features( - hyp=hyp, ref=ref, tags=tags - ) - - # get input features for words - hyp_with_words = hyp.replace(" ", "").replace("_", " ") - ref_with_words = ref.replace(" ", "").replace("_", " ") - ( - input_ids_for_subwords, - input_mask_for_subwords, - segment_ids_for_subwords, - _, - _, - _, - _, - ) = bert_example_builder._get_input_features(hyp=hyp_with_words, ref=ref_with_words, tags=None) - - character_pos_to_subword_pos = bert_example_builder._map_characters_to_subwords(input_ids, input_ids_for_subwords) - - assert tags == gold_tags - assert input_ids == gold_input_ids - assert input_mask == gold_input_mask - assert segment_ids == gold_segment_ids - assert labels_mask == gold_labels_mask - assert input_ids_for_subwords == gold_input_ids_for_subwords - assert input_mask_for_subwords == gold_input_mask_for_subwords - assert segment_ids_for_subwords == gold_segment_ids_for_subwords - assert character_pos_to_subword_pos == gold_character_pos_to_subword_pos diff --git a/tutorials/nlp/Dialogue.ipynb b/tutorials/nlp/Dialogue.ipynb deleted file mode 100644 index ddd3bdd4f9298..0000000000000 --- a/tutorials/nlp/Dialogue.ipynb +++ /dev/null @@ -1,717 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "jaosjY4rGRNH" - }, - "source": [ - "# Installing NeMo from source\n", - "\n", - "\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run the cell below to set up dependencies.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "goQzOSflEq27" - }, - "outputs": [], - "source": [ - "import os \n", - "BRANCH = 'main'\n", - "!apt-get update && apt-get install -y libsndfile1 ffmpeg\n", - "!git clone https://github.com/NVIDIA/NeMo --branch $BRANCH\n", - "os.chdir('NeMo')\n", - "!./reinstall.sh\n", - "os.chdir('..')\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GjQ_z_xQMDIb" - }, - "source": [ - "# Overview\n", - "\n", - "There are three tasks as part of this tutorial\n", - "\n", - "1. Intent and Slot Classification using Assistant Dataset and a BERT model\n", - "2. Intent Classification using Schema Guided Dialogue Dataset and a GPT2 model\n", - "3. Answer Extender using MS Marco NLGen Dataset and a BART model\n", - "\n", - "Feel free to skip to the task that interests you most after installing NeMo from source." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AS-zwy8tEq2_" - }, - "source": [ - "# 1. Intent and Slot Classification using Assistant Dataset\n", - "\n", - "## 1.1 Task Description\n", - "\n", - "**Joint Intent and Slot classification** - is a task of classifying an Intent and detecting all relevant Slots (Entities)\n", - "for this Intent in a query.\n", - "For example, in the query: `What is the weather in Santa Clara tomorrow morning?`, we would like to classify the query\n", - "as a `weather` Intent, and detect `Santa Clara` as a `location` slot and `tomorrow morning` as a `date_time` slot.\n", - "Intents and Slots names are usually task specific and defined as labels in the training data.\n", - "This is a fundamental step that is executed in any task-driven Conversational Assistant.\n", - "\n", - "Our model enables to train and then detect both of these tasks together.\n", - "\n", - "Note: There is a similar model available at [Joint Intent Slot Classification Colab](https://colab.research.google.com/github/NVIDIA/NeMo/blob/stable/tutorials/nlp/Joint_Intent_and_Slot_Classification.ipynb). However, this model only support BERT style models while the model in this tutorial supports other types of models such as GPT2. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FJk_UAyeEq3B" - }, - "source": [ - "\n", - "## 1.2 Download Assistant dataset and convert to NeMo format\n", - "\n", - "This is a virtual assistant interaction data set that can be downloaded from here: https://github.com/xliuhw/NLU-Evaluation-Data.\n", - "There are about 10K training and 1K testing queries which cover 64 various Intents and 55 Slots. \n", - "\n", - "An example is:\n", - "\n", - "* utterance: what alarms have i set for tomorrow \n", - "* intent: alarm_query\n", - "* slots: date(tomorrow)\n", - "\n", - "\n", - "Note: While only the assistant dataset is used here, import_dataset.py is also compatible with ATIS and SNIPS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jjOVdGX2Eq3D" - }, - "outputs": [], - "source": [ - "# download and unzip the example dataset from github\n", - "!wget https://github.com/xliuhw/NLU-Evaluation-Data/archive/master.zip\n", - "!unzip master.zip\n", - "# convert the dataset to the NeMo format\n", - "!python NeMo/scripts/dataset_processing/nlp/intent_and_slot/import_datasets.py --dataset_name=assistant --source_data_dir=./NLU-Evaluation-Data-master --target_data_dir=./assistant" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5n81deZsEq3G" - }, - "source": [ - "## 1.3 Training and/or Testing the model\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eoYc_8jhEq3G" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " model.dataset.dialogues_example_dir='./assistant_bert_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " model.language_model.pretrained_model_name='bert-base-uncased' \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "GaPmHjayEbg8" - }, - "source": [ - "**Results after 3 epochs**\n", - "\n", - "Intent report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_query (label_id: 0) 100.00 94.44 97.14 18\n", - " alarm_remove (label_id: 1) 100.00 90.91 95.24 11\n", - " alarm_set (label_id: 2) 94.12 94.12 94.12 17\n", - " audio_volume_down (label_id: 3) 75.00 42.86 54.55 7\n", - " audio_volume_mute (label_id: 4) 100.00 92.86 96.30 14\n", - " audio_volume_up (label_id: 5) 72.22 100.00 83.87 13\n", - " calendar_query (label_id: 6) 87.50 77.78 82.35 18\n", - " calendar_remove (label_id: 7) 94.44 100.00 97.14 17\n", - " calendar_set (label_id: 8) 94.44 94.44 94.44 18\n", - " cooking_recipe (label_id: 9) 85.71 70.59 77.42 17\n", - " datetime_convert (label_id: 10) 88.89 100.00 94.12 8\n", - " datetime_query (label_id: 11) 89.47 100.00 94.44 17\n", - " email_addcontact (label_id: 12) 80.00 100.00 88.89 8\n", - " email_query (label_id: 13) 100.00 83.33 90.91 18\n", - " email_querycontact (label_id: 14) 78.95 88.24 83.33 17\n", - " email_sendemail (label_id: 15) 94.44 94.44 94.44 18\n", - " general_affirm (label_id: 16) 100.00 100.00 100.00 17\n", - " general_commandstop (label_id: 17) 100.00 100.00 100.00 18\n", - " general_confirm (label_id: 18) 100.00 100.00 100.00 17\n", - " general_dontcare (label_id: 19) 100.00 100.00 100.00 18\n", - " general_explain (label_id: 20) 100.00 100.00 100.00 17\n", - " general_joke (label_id: 21) 91.67 100.00 95.65 11\n", - " general_negate (label_id: 22) 100.00 100.00 100.00 18\n", - " general_praise (label_id: 23) 100.00 100.00 100.00 17\n", - " general_quirky (label_id: 24) 60.00 50.00 54.55 18\n", - " general_repeat (label_id: 25) 100.00 100.00 100.00 17\n", - " iot_cleaning (label_id: 26) 100.00 100.00 100.00 15\n", - " iot_coffee (label_id: 27) 85.71 100.00 92.31 18\n", - " iot_hue_lightchange (label_id: 28) 100.00 94.12 96.97 17\n", - " iot_hue_lightdim (label_id: 29) 100.00 100.00 100.00 12\n", - " iot_hue_lightoff (label_id: 30) 100.00 100.00 100.00 17\n", - " iot_hue_lighton (label_id: 31) 100.00 50.00 66.67 4\n", - " iot_hue_lightup (label_id: 32) 84.62 91.67 88.00 12\n", - " iot_wemo_off (label_id: 33) 100.00 100.00 100.00 9\n", - " iot_wemo_on (label_id: 34) 100.00 85.71 92.31 7\n", - " lists_createoradd (label_id: 35) 90.00 100.00 94.74 18\n", - " lists_query (label_id: 36) 100.00 94.12 96.97 17\n", - " lists_remove (label_id: 37) 88.89 88.89 88.89 18\n", - " music_likeness (label_id: 38) 100.00 93.75 96.77 16\n", - " music_query (label_id: 39) 100.00 100.00 100.00 17\n", - " music_settings (label_id: 40) 77.78 100.00 87.50 7\n", - " news_query (label_id: 41) 72.73 88.89 80.00 18\n", - " play_audiobook (label_id: 42) 100.00 100.00 100.00 17\n", - " play_game (label_id: 43) 93.75 83.33 88.24 18\n", - " play_music (label_id: 44) 85.00 100.00 91.89 17\n", - " play_podcasts (label_id: 45) 100.00 88.89 94.12 18\n", - " play_radio (label_id: 46) 84.21 94.12 88.89 17\n", - " qa_currency (label_id: 47) 85.00 94.44 89.47 18\n", - " qa_definition (label_id: 48) 89.47 100.00 94.44 17\n", - " qa_factoid (label_id: 49) 64.00 88.89 74.42 18\n", - " qa_maths (label_id: 50) 84.62 84.62 84.62 13\n", - " qa_stock (label_id: 51) 87.50 77.78 82.35 18\n", - " recommendation_events (label_id: 52) 87.50 82.35 84.85 17\n", - " recommendation_locations (label_id: 53) 83.33 83.33 83.33 18\n", - " recommendation_movies (label_id: 54) 100.00 60.00 75.00 10\n", - " social_post (label_id: 55) 100.00 94.12 96.97 17\n", - " social_query (label_id: 56) 100.00 82.35 90.32 17\n", - " takeaway_order (label_id: 57) 92.31 70.59 80.00 17\n", - " takeaway_query (label_id: 58) 93.75 83.33 88.24 18\n", - " transport_query (label_id: 59) 81.25 76.47 78.79 17\n", - " transport_taxi (label_id: 60) 100.00 100.00 100.00 16\n", - " transport_ticket (label_id: 61) 85.00 94.44 89.47 18\n", - " transport_traffic (label_id: 62) 93.75 88.24 90.91 17\n", - " weather_query (label_id: 63) 89.47 100.00 94.44 17\n", - " -------------------\n", - " micro avg 91.16 91.16 91.16 996\n", - " macro avg 91.66 90.44 90.48 996\n", - " weighted avg 91.72 91.16 91.04 996\n", - "```\n", - "Slot report: \n", - "```\n", - " label precision recall f1 support \n", - " alarm_type (label_id: 0) 0.00 0.00 0.00 2\n", - " app_name (label_id: 1) 0.00 0.00 0.00 1\n", - " artist_name (label_id: 2) 17.39 80.00 28.57 5\n", - " audiobook_author (label_id: 3) 0.00 0.00 0.00 0\n", - " audiobook_name (label_id: 4) 64.52 74.07 68.97 27\n", - " business_name (label_id: 5) 81.48 84.62 83.02 52\n", - " business_type (label_id: 6) 80.00 80.00 80.00 20\n", - " change_amount (label_id: 7) 57.14 66.67 61.54 6\n", - " coffee_type (label_id: 8) 100.00 33.33 50.00 3\n", - " color_type (label_id: 9) 75.00 92.31 82.76 13\n", - " cooking_type (label_id: 10) 0.00 0.00 0.00 1\n", - " currency_name (label_id: 11) 100.00 96.43 98.18 28\n", - " date (label_id: 12) 87.88 87.22 87.55 133\n", - " definition_word (label_id: 13) 85.00 85.00 85.00 20\n", - " device_type (label_id: 14) 84.75 76.92 80.65 65\n", - " drink_type (label_id: 15) 0.00 0.00 0.00 0\n", - " email_address (label_id: 16) 64.29 100.00 78.26 9\n", - " email_folder (label_id: 17) 100.00 50.00 66.67 2\n", - " event_name (label_id: 18) 80.00 75.00 77.42 64\n", - " food_type (label_id: 19) 84.38 77.14 80.60 35\n", - " game_name (label_id: 20) 93.55 78.38 85.29 37\n", - " game_type (label_id: 21) 0.00 0.00 0.00 0\n", - " general_frequency (label_id: 22) 0.00 0.00 0.00 9\n", - " house_place (label_id: 23) 80.95 91.89 86.08 37\n", - " ingredient (label_id: 24) 0.00 0.00 0.00 1\n", - " joke_type (label_id: 25) 100.00 100.00 100.00 5\n", - " list_name (label_id: 26) 89.29 69.44 78.12 36\n", - " meal_type (label_id: 27) 0.00 0.00 0.00 3\n", - " media_type (label_id: 28) 78.95 83.33 81.08 36\n", - " movie_name (label_id: 29) 0.00 0.00 0.00 1\n", - " movie_type (label_id: 30) 0.00 0.00 0.00 0\n", - " music_album (label_id: 31) 0.00 0.00 0.00 0\n", - " music_descriptor (label_id: 32) 0.00 0.00 0.00 2\n", - " music_genre (label_id: 33) 81.82 90.00 85.71 10\n", - " news_topic (label_id: 34) 80.00 30.77 44.44 13\n", - " order_type (label_id: 35) 100.00 42.11 59.26 19\n", - " person (label_id: 36) 70.79 100.00 82.89 63\n", - " personal_info (label_id: 37) 76.19 94.12 84.21 17\n", - " place_name (label_id: 38) 82.86 84.47 83.65 103\n", - " player_setting (label_id: 39) 75.00 42.86 54.55 7\n", - " playlist_name (label_id: 40) 0.00 0.00 0.00 3\n", - " podcast_descriptor (label_id: 41) 92.31 54.55 68.57 22\n", - " podcast_name (label_id: 42) 66.67 16.67 26.67 12\n", - " radio_name (label_id: 43) 94.87 94.87 94.87 39\n", - " relation (label_id: 44) 90.91 90.91 90.91 11\n", - " song_name (label_id: 45) 100.00 6.67 12.50 15\n", - " time (label_id: 46) 77.57 84.69 80.98 98\n", - " time_zone (label_id: 47) 44.44 100.00 61.54 4\n", - " timeofday (label_id: 48) 86.96 80.00 83.33 25\n", - " transport_agency (label_id: 49) 80.00 57.14 66.67 7\n", - " transport_descriptor (label_id: 50) 0.00 0.00 0.00 5\n", - " transport_name (label_id: 51) 0.00 0.00 0.00 0\n", - " transport_type (label_id: 52) 88.89 100.00 94.12 40\n", - " weather_descriptor (label_id: 53) 87.50 87.50 87.50 8\n", - " O (label_id: 54) 97.07 97.52 97.30 5408\n", - " -------------------\n", - " micro avg 94.24 94.24 94.24 6582\n", - " macro avg 64.87 59.93 59.17 6582\n", - " weighted avg 94.23 94.24 93.95 6582\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-44x5PqyrOeQ" - }, - "source": [ - "## 1.4 (Optional) To train/ test a GPT2 model on the assistant dataset, run the cell below " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "QyqQbpR4rNHT" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "# model.dataset.target_template=with_slots: this perform slot filling with intent classification\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./assistant' \\\n", - " model.dataset.dialogues_example_dir='./assistant_gpt2_examples' \\\n", - " model.dataset.task='assistant' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " model.dataset.target_template=with_slots \\\n", - " model.dataset.eval_mode=generation \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "FbQ-6TVM1yQg" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would be helpful\n", - "\n", - "Intent report:\n", - "\n", - " ```\n", - " label precision recall f1 support \n", - " transport query (label_id: 0) 72.73 84.21 78.05 19\n", - " weather query (label_id: 1) 94.74 94.74 94.74 19\n", - " play game (label_id: 2) 92.86 68.42 78.79 19\n", - " qa currency (label_id: 3) 100.00 100.00 100.00 19\n", - " qa maths (label_id: 4) 100.00 100.00 100.00 14\n", - " iot wemo off (label_id: 5) 75.00 100.00 85.71 9\n", - " datetime convert (label_id: 6) 46.67 87.50 60.87 8\n", - " email addcontact (label_id: 7) 70.00 87.50 77.78 8\n", - " music likeness (label_id: 8) 57.89 61.11 59.46 18\n", - " music query (label_id: 9) 78.57 57.89 66.67 19\n", - " general negate (label_id: 10) 95.00 100.00 97.44 19\n", - " email sendemail (label_id: 11) 92.86 68.42 78.79 19\n", - " general affirm (label_id: 12) 95.00 100.00 97.44 19\n", - " play audiobook (label_id: 13) 57.69 78.95 66.67 19\n", - " general praise (label_id: 14) 100.00 94.74 97.30 19\n", - " alarm set (label_id: 15) 85.71 94.74 90.00 19\n", - " general explain (label_id: 16) 100.00 89.47 94.44 19\n", - " iot wemo on (label_id: 17) 83.33 71.43 76.92 7\n", - " cooking recipe (label_id: 18) 90.00 94.74 92.31 19\n", - " music settings (label_id: 19) 60.00 42.86 50.00 7\n", - " social post (label_id: 20) 84.21 84.21 84.21 19\n", - " recommendation events (label_id: 21) 72.73 84.21 78.05 19\n", - " audio volume up (label_id: 22) 76.47 100.00 86.67 13\n", - " lists remove (label_id: 23) 73.08 100.00 84.44 19\n", - " transport ticket (label_id: 24) 94.74 94.74 94.74 19\n", - " general joke (label_id: 25) 100.00 100.00 100.00 12\n", - " play podcasts (label_id: 26) 94.12 84.21 88.89 19\n", - " iot hue lightchange (label_id: 27) 85.71 63.16 72.73 19\n", - " audio volume mute (label_id: 28) 84.62 73.33 78.57 15\n", - " general dontcare (label_id: 29) 95.00 100.00 97.44 19\n", - " qa definition (label_id: 30) 77.27 89.47 82.93 19\n", - " email querycontact (label_id: 31) 58.33 73.68 65.12 19\n", - " general commandstop (label_id: 32) 100.00 100.00 100.00 19\n", - " calendar remove (label_id: 33) 94.44 89.47 91.89 19\n", - " news query (label_id: 34) 100.00 57.89 73.33 19\n", - " calendar query (label_id: 35) 63.16 63.16 63.16 19\n", - " social query (label_id: 36) 88.24 83.33 85.71 18\n", - " transport traffic (label_id: 37) 90.48 100.00 95.00 19\n", - " transport taxi (label_id: 38) 100.00 94.44 97.14 18\n", - " alarm query (label_id: 39) 100.00 94.74 97.30 19\n", - " iot hue lightoff (label_id: 40) 88.89 84.21 86.49 19\n", - " takeaway order (label_id: 41) 81.25 68.42 74.29 19\n", - " iot coffee (label_id: 42) 100.00 94.74 97.30 19\n", - " recommendation movies (label_id: 43) 75.00 90.00 81.82 10\n", - " iot hue lightup (label_id: 44) 78.57 78.57 78.57 14\n", - " email query (label_id: 45) 85.71 94.74 90.00 19\n", - " lists createoradd (label_id: 46) 82.35 73.68 77.78 19\n", - " play radio (label_id: 47) 84.21 84.21 84.21 19\n", - " audio volume down (label_id: 48) 100.00 87.50 93.33 8\n", - " general quirky (label_id: 49) 30.00 15.79 20.69 19\n", - " play music (label_id: 50) 71.43 52.63 60.61 19\n", - " qa stock (label_id: 51) 90.48 100.00 95.00 19\n", - " iot cleaning (label_id: 52) 93.33 87.50 90.32 16\n", - " iot hue lightdim (label_id: 53) 100.00 100.00 100.00 12\n", - " recommendation locations (label_id: 54) 100.00 89.47 94.44 19\n", - " general repeat (label_id: 55) 100.00 100.00 100.00 19\n", - " takeaway query (label_id: 56) 77.27 89.47 82.93 19\n", - " alarm remove (label_id: 57) 100.00 100.00 100.00 11\n", - " datetime query (label_id: 58) 75.00 63.16 68.57 19\n", - " iot hue lighton (label_id: 59) 60.00 100.00 75.00 3\n", - " qa factoid (label_id: 60) 50.00 57.89 53.66 19\n", - " calendar set (label_id: 61) 75.00 78.95 76.92 19\n", - " general confirm (label_id: 62) 100.00 100.00 100.00 19\n", - " lists query (label_id: 63) 66.67 73.68 70.00 19\n", - " label_id: 64 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 83.55 83.55 83.55 1076\n", - " macro avg 83.53 83.93 83.01 1076\n", - " weighted avg 84.26 83.55 83.30 1076\n", - " \n", - "```\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " intent_f1 83.55018615722656\n", - " intent_precision 83.55018615722656\n", - " intent_recall 83.55018615722656\n", - " slot_f1 73.99985919756773\n", - "slot_joint_goal_accuracy 65.89219330855019\n", - " slot_precision 73.85223048327137\n", - " slot_recall 74.14807930607186\n", - " test_intent_accuracy 83.55018587360595\n", - " test_loss_epoch 0.019178826361894608\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gd42arYoEq3J" - }, - "source": [ - "# 2. Schema Guided Dialogue (SGD)\n", - "\n", - "## 2.1 Task Description\n", - "---\n", - "\n", - "SGD is a multi-domain intent classification dataset from Google with close to 100k examples.\n", - "\n", - "An example is:\n", - "\n", - "* utterance: I will be eating there at 11:30 am so make the reservation for then.\n", - "* intent: ReserveRestaurant\n", - "* slots: {\"time\": \"11:30 am\"}\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "neH8rXwjEq3J" - }, - "source": [ - "## 2.2 Download the dataset" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "IgD8eavfJ5pi" - }, - "outputs": [], - "source": [ - "!git clone https://github.com/google-research-datasets/dstc8-schema-guided-dialogue.git" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "7G7uPrUpEq3J" - }, - "source": [ - "## 2.3 Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gqo-rwQlEq3K" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "# model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\": gpt2 doesn't specify a pad token, therefore using its EOS token as the pad token\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.data_dir='./dstc8-schema-guided-dialogue' \\\n", - " model.dataset.dialogues_example_dir='./sgd_gpt2_predictions' \\\n", - " model.dataset.task='sgd' \\\n", - " model.language_model.pretrained_model_name='gpt2' \\\n", - " trainer.max_epochs=1 \\\n", - " model.tokenizer.special_tokens=\"{pad_token:'<|endoftext|>'}\" \\\n", - " exp_manager.create_wandb_logger=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kGDlV5HvI2PQ" - }, - "outputs": [], - "source": [ - "!ls sgd_gpt2_predictions" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "p8g0f5KDTu9K" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "More epochs would needed to reach convergence.\n", - "\n", - "\n", - "```\n", - " label precision recall f1 support \n", - " check balance (label_id: 0) 0.00 0.00 0.00 0\n", - " find trains (label_id: 1) 80.20 91.95 85.68 348\n", - " make payment (label_id: 2) 83.12 28.07 41.97 228\n", - " book appointment (label_id: 3) 86.93 87.15 87.04 397\n", - " get cars available (label_id: 4) 96.88 90.51 93.58 274\n", - " get event dates (label_id: 5) 0.00 0.00 0.00 0\n", - " buy bus ticket (label_id: 6) 78.61 91.33 84.49 173\n", - " add event (label_id: 7) 0.00 0.00 0.00 0\n", - " get alarms (label_id: 8) 58.33 77.78 66.67 45\n", - " reserve car (label_id: 9) 83.75 72.43 77.68 185\n", - " get events (label_id: 10) 0.00 0.00 0.00 0\n", - " reserve roundtrip flights (label_id: 11) 0.00 0.00 0.00 0\n", - " lookup music (label_id: 12) 89.83 86.89 88.33 61\n", - " book house (label_id: 13) 91.13 92.50 91.81 200\n", - " search oneway flight (label_id: 14) 74.77 47.70 58.25 174\n", - " buy event tickets (label_id: 15) 72.19 95.31 82.15 128\n", - " find apartment (label_id: 16) 0.00 0.00 0.00 0\n", - " schedule visit (label_id: 17) 77.27 66.06 71.23 386\n", - " play media (label_id: 18) 92.94 86.81 89.77 91\n", - " get ride (label_id: 19) 99.41 98.82 99.12 170\n", - " reserve oneway flight (label_id: 20) 0.00 0.00 0.00 0\n", - " find bus (label_id: 21) 96.64 87.53 91.86 361\n", - " find restaurants (label_id: 22) 77.14 91.22 83.59 148\n", - " get times for movie (label_id: 23) 0.00 0.00 0.00 0\n", - " transfer money (label_id: 24) 0.00 0.00 0.00 0\n", - " request payment (label_id: 25) 46.71 63.39 53.79 112\n", - " play movie (label_id: 26) 100.00 65.11 78.87 321\n", - " search house (label_id: 27) 97.91 91.83 94.77 306\n", - " search roundtrip flights (label_id: 28) 67.49 82.41 74.21 199\n", - " find provider (label_id: 29) 95.11 90.53 92.77 602\n", - " find attractions (label_id: 30) 100.00 89.01 94.19 91\n", - " reserve hotel (label_id: 31) 56.75 97.04 71.62 169\n", - " lookup song (label_id: 32) 0.00 0.00 0.00 0\n", - " add alarm (label_id: 33) 95.68 60.18 73.89 221\n", - " find home by area (label_id: 34) 48.95 59.79 53.83 194\n", - " get available time (label_id: 35) 0.00 0.00 0.00 0\n", - " buy movie tickets (label_id: 36) 100.00 29.39 45.42 473\n", - " reserve restaurant (label_id: 37) 95.71 84.80 89.92 342\n", - " find movies (label_id: 38) 62.40 97.61 76.14 335\n", - " get weather (label_id: 39) 100.00 87.69 93.44 195\n", - " search hotel (label_id: 40) 99.35 52.60 68.78 289\n", - " find events (label_id: 41) 99.57 82.56 90.27 281\n", - " play song (label_id: 42) 0.00 0.00 0.00 0\n", - " rent movie (label_id: 43) 0.00 0.00 0.00 0\n", - " get train tickets (label_id: 44) 45.83 5.56 9.91 198\n", - " none (label_id: 45) 55.77 98.90 71.32 728\n", - " label_id: 46 0.00 0.00 0.00 0\n", - " -------------------\n", - " micro avg 77.23 77.23 77.23 8425\n", - " macro avg 82.01 76.68 76.56 8425\n", - " weighted avg 83.23 77.23 76.86 8425\n", - "\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jUJb-9VLLBXo" - }, - "source": [ - "# 3. MS Marco\n", - "\n", - "## Task Description\n", - "\n", - "MS Marco NLGen is a dataset from Microsoft that takes extracted answers and questions and output fluent answers.\n", - "\n", - "An example is \n", - "\n", - "\n", - "* question: What county is Nine Mile in?\n", - "* extracted_answer: Onondaga\n", - "* fluent_answer: Nine Mile is in Onondaga county.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VtXEKG_UQU9u" - }, - "source": [ - "## Download and unzip files" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "b9avsZ1CEq3K" - }, - "outputs": [], - "source": [ - "!mkdir ms_marco\n", - "os.chdir('ms_marco')\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz\n", - "\n", - "!gunzip train_v2.1.json.gz\n", - "!gunzip dev_v2.1.json.gz\n", - "\n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename train_v2.1.json \n", - "!python ../NeMo/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py --filename dev_v2.1.json \n", - "\n", - "os.chdir('..')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "h7UZ9R8gQTFo" - }, - "source": [ - "## Training and/or Testing the model\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fwGQCwbvRf2m" - }, - "outputs": [], - "source": [ - "# model.dataset.data_dir: folder to load data from\n", - "# model.dataset.dialogues_example_dir: folder that stores predictions for each sample\n", - "\n", - "!(python NeMo/examples/nlp/dialogue/dialogue.py \\\n", - " do_training=True \\\n", - " model.dataset.dialogues_example_dir='./marco_bart_predictions' \\\n", - " model.dataset.data_dir='./ms_marco' \\\n", - " model.save_model=True \\\n", - " model.dataset.debug_mode=True \\\n", - " model.dataset.task='ms_marco' \\\n", - " model.language_model.pretrained_model_name='facebook/bart-base' \\\n", - " trainer.max_epochs=1 \\\n", - " model.dataset.debug_mode=False \\\n", - " exp_manager.create_wandb_logger=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UL7ekAOZ2abi" - }, - "source": [ - "**After 1 epoch:**\n", - "\n", - "Train more epochs for optimal performance\n", - "\n", - "```\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " Test metric DataLoader 0\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - " bleu 65.46179962158203\n", - " f1 78.24439835896995\n", - " precision 81.92473076099847\n", - " recall 76.72508929408436\n", - " test_accuracy 25.563487607283225\n", - " test_loss 0.4419259166606655\n", - " test_loss_epoch 0.4420809745788574\n", - " test_ppl 1.5557004846779854\n", - "────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────\n", - "```" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Dialogue.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.7" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/Entity_Linking_Medical.ipynb b/tutorials/nlp/Entity_Linking_Medical.ipynb deleted file mode 100644 index dfdf594e6804f..0000000000000 --- a/tutorials/nlp/Entity_Linking_Medical.ipynb +++ /dev/null @@ -1,632 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "\n", - "## Install NeMo if using google collab or if its not installed locally\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[all]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "## Install dependencies\n", - "!pip install wget\n", - "!pip install faiss-gpu" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import faiss\n", - "import torch\n", - "import wget\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from omegaconf import OmegaConf\n", - "from pytorch_lightning import Trainer\n", - "from IPython.display import display\n", - "from tqdm import tqdm\n", - "\n", - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Task Description\n", - "[Entity linking](https://en.wikipedia.org/wiki/Entity_linking) is the process of connecting concepts mentioned in natural language to their canonical forms stored in a knowledge base. For example, say a knowledge base contained the entity 'ID3452 influenza' and we wanted to process some natural language containing the sentence \"The patient has flu like symptoms\". An entity linking model would match the word 'flu' to the knowledge base entity 'ID3452 influenza', allowing for disambiguation and normalization of concepts referenced in text. Entity linking applications range from helping automate data ingestion to assisting in real time dialogue concept normalization. We will be focusing on entity linking in the medical domain for this demo, but the entity linking model, dataset, and training code within NVIDIA NeMo can be applied to other domains like finance and retail.\n", - "\n", - "Within NeMo and this tutorial we use the entity linking approach described in Liu et. al's NAACL 2021 \"[Self-alignment Pre-training for Biomedical Entity Representations](https://arxiv.org/abs/2010.11784v2)\". The main idea behind this approach is to reshape an initial concept embedding space such that synonyms of the same concept are pulled closer together and unrelated concepts are pushed further apart. The concept embeddings from this reshaped space can then be used to build a knowledge base embedding index. This index stores concept IDs mapped to their respective concept embeddings in a format conducive to efficient nearest neighbor search. We can link query concepts to their canonical forms in the knowledge base by performing a nearest neighbor search- matching concept query embeddings to the most similar concepts embeddings in the knowledge base index. \n", - "\n", - "In this tutorial we will be using the [faiss](https://github.com/facebookresearch/faiss) library to build our concept index." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Self Alignment Pretraining\n", - "Self-Alignment pretraining is a second stage pretraining of an existing encoder (called second stage because the encoder model can be further finetuned after this more general pretraining step). The dataset used during training consists of pairs of concept synonyms that map to the same ID. At each training iteration, we only select *hard* examples present in the mini batch to calculate the loss and update the model weights. In this context, a hard example is an example where a concept is closer to an unrelated concept in the mini batch than it is to the synonym concept it is paired with by some margin. I encourage you to take a look at [section 2 of the paper](https://arxiv.org/pdf/2010.11784.pdf) for a more formal and in depth description of how hard examples are selected.\n", - "\n", - "We then use a [metric learning loss](https://openaccess.thecvf.com/content_CVPR_2019/papers/Wang_Multi-Similarity_Loss_With_General_Pair_Weighting_for_Deep_Metric_Learning_CVPR_2019_paper.pdf) calculated from the hard examples selected. This loss helps reshape the embedding space. The concept representation space is rearranged to be more suitable for entity matching via embedding cosine similarity. \n", - "\n", - "Now that we have idea of what's going on, let's get started!" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Dataset Preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Download data into project directory\n", - "PROJECT_DIR = \".\" #Change if you don't want the current directory to be the project dir\n", - "DATA_DIR = os.path.join(PROJECT_DIR, \"tiny_example_data\")\n", - "\n", - "if not os.path.isdir(os.path.join(DATA_DIR)):\n", - " wget.download('https://dldata-public.s3.us-east-2.amazonaws.com/tiny_example_data.zip',\n", - " os.path.join(PROJECT_DIR, \"tiny_example_data.zip\"))\n", - "\n", - " !unzip {PROJECT_DIR}/tiny_example_data.zip -d {PROJECT_DIR}" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this tutorial we will be using a tiny toy dataset to demonstrate how to use NeMo's entity linking model functionality. The dataset includes synonyms for 12 medical concepts. Entity phrases with the same ID are synonyms for the same concept. For example, \"*chronic kidney failure*\", \"*gradual loss of kidney function*\", and \"*CKD*\" are all synonyms of concept ID 5. Here's the dataset before preprocessing:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "raw_data = pd.read_csv(os.path.join(DATA_DIR, \"tiny_example_dev_data.csv\"), names=[\"ID\", \"CONCEPT\"], index_col=False)\n", - "print(raw_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We've already paired off the concepts for this dataset with the format `ID concept_synonym1 concept_synonym2`. Here are the first ten rows:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "training_data = pd.read_table(os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\"), names=[\"ID\", \"CONCEPT_SYN1\", \"CONCEPT_SYN2\"], delimiter='\\t')\n", - "print(training_data.head(10))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the [Unified Medical Language System (UMLS)](https://www.nlm.nih.gov/research/umls/index.html) dataset for full medical domain entity linking training. The data contains over 9 million entities and is a table of medical concepts with their corresponding concept IDs (CUI). After [requesting a free license and making a UMLS Terminology Services (UTS) account](https://www.nlm.nih.gov/research/umls/index.html), the [entire UMLS dataset](https://www.nlm.nih.gov/research/umls/licensedcontent/umlsknowledgesources.html) can be downloaded from the NIH's website. If you've cloned the NeMo repo you can run the data processing script located in `examples/nlp/entity_linking/data/umls_dataset_processing.py` on the full dataset. This script will take in the initial table of UMLS concepts and produce a .tsv file with each row formatted as `CUI\\tconcept_synonym1\\tconcept_synonym2`. Once the UMLS dataset .RRF file is downloaded, the script can be run from the `examples/nlp/entity_linking` directory like so: \n", - "```\n", - "python data/umls_dataset_processing.py\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Second stage pretrain a BERT Base encoder on the self-alignment pretraining task (SAP) for improved entity linking. Using a GPU, the model should take 5 minutes or less to train on this example dataset and training progress will be output below the cell." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "#Download config\n", - "wget.download(f\"https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/entity_linking/conf/tiny_example_entity_linking_config.yaml\",\n", - " os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Load in config file\n", - "cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set config file variables\n", - "cfg.project_dir = PROJECT_DIR\n", - "cfg.model.nemo_path = os.path.join(PROJECT_DIR, \"tiny_example_sap_bert_model.nemo\")\n", - "cfg.model.train_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_train_pairs.tsv\")\n", - "cfg.model.validation_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_validation_pairs.tsv\")\n", - "\n", - "# remove distributed training flags\n", - "cfg.trainer.strategy = 'auto'\n", - "cfg.trainer.accelerator = 'auto'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Initialize the trainer and model\n", - "trainer = Trainer(**cfg.trainer)\n", - "exp_manager(trainer, cfg.get(\"exp_manager\", None))\n", - "model = nemo_nlp.models.EntityLinkingModel(cfg=cfg.model, trainer=trainer)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Train and save the model\n", - "trainer.fit(model)\n", - "model.save_to(cfg.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can run the script at `examples/nlp/entity_linking/self_alignment_pretraining.py` to train a model on a larger dataset. Run\n", - "\n", - "```\n", - "python self_alignment_pretraining.py project_dir=.\n", - "```\n", - "from the `examples/nlp/entity_linking` directory." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Model Evaluation\n", - "\n", - "Let's evaluate our freshly trained model and compare its performance with a BERT Base encoder that hasn't undergone self-alignment pretraining. We first need to restore our trained model and load our BERT Base Baseline model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", - "\n", - "# Restore second stage pretrained model\n", - "sap_model_cfg = cfg\n", - "sap_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_index\")\n", - "sap_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "sap_model = nemo_nlp.models.EntityLinkingModel.restore_from(sap_model_cfg.model.nemo_path).to(device)\n", - "\n", - "# Load original model\n", - "base_model_cfg = OmegaConf.load(os.path.join(PROJECT_DIR, \"tiny_example_entity_linking_config.yaml\"))\n", - "\n", - "# Set train/val datasets to None to avoid loading datasets associated with training\n", - "base_model_cfg.model.train_ds = None\n", - "base_model_cfg.model.validation_ds = None\n", - "base_model_cfg.index.index_save_name = os.path.join(PROJECT_DIR, \"base_model_index\")\n", - "base_model_cfg.index.index_ds.data_file = os.path.join(DATA_DIR, \"tiny_example_index_data.tsv\")\n", - "base_model = nemo_nlp.models.EntityLinkingModel(base_model_cfg.model).to(device)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We are going evaluate our model on a nearest neighbor task using top 1 and top 5 accuracies as our metric. We will be using a tiny example test knowledge base and test queries. For this evaluation we are going to be comparing every test query with every concept vector in our test set knowledge base. We will rank each item in the knowledge base by its cosine similarity with the test query. We'll then compare the IDs of the predicted most similar test knowledge base concepts with our ground truth query IDs to calculate top 1 and top 5 accuracies. For this metric higher is better." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Helper function to get data embeddings\n", - "def get_embeddings(model, dataloader):\n", - " embeddings, cids = [], []\n", - "\n", - " with torch.no_grad():\n", - " for batch in tqdm(dataloader):\n", - " input_ids, token_type_ids, attention_mask, batch_cids = batch\n", - " batch_embeddings = model.forward(input_ids=input_ids.to(device), \n", - " token_type_ids=token_type_ids.to(device), \n", - " attention_mask=attention_mask.to(device))\n", - "\n", - " # Accumulate index embeddings and their corresponding IDs\n", - " embeddings.extend(batch_embeddings.cpu().detach().numpy())\n", - " cids.extend(batch_cids)\n", - " \n", - " return embeddings, cids" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def evaluate(model, test_kb, test_queries, ks):\n", - " # Initialize knowledge base and query data loaders\n", - " test_kb_dataloader = model.setup_dataloader(test_kb, is_index_data=True)\n", - " test_query_dataloader = model.setup_dataloader(test_queries, is_index_data=True)\n", - " \n", - " # Get knowledge base and query embeddings\n", - " test_kb_embs, test_kb_cids = get_embeddings(model, test_kb_dataloader)\n", - " test_query_embs, test_query_cids = get_embeddings(model, test_query_dataloader)\n", - "\n", - " # Calculate the cosine distance between each query and knowledge base concept\n", - " score_matrix = np.matmul(np.array(test_query_embs), np.array(test_kb_embs).T)\n", - " accs = {k : 0 for k in ks}\n", - " \n", - " # Compare the knowledge base IDs of the knowledge base entities with \n", - " # the smallest cosine distance from the query \n", - " for query_idx in tqdm(range(len(test_query_cids))):\n", - " query_emb = test_query_embs[query_idx]\n", - " query_cid = test_query_cids[query_idx]\n", - " query_scores = score_matrix[query_idx]\n", - "\n", - " for k in ks:\n", - " topk_idxs = np.argpartition(query_scores, -k)[-k:]\n", - " topk_cids = [test_kb_cids[idx] for idx in topk_idxs]\n", - " \n", - " # If the correct query ID is among the top k closest kb IDs\n", - " # the model correctly linked the entity\n", - " match = int(query_cid in topk_cids)\n", - " accs[k] += match\n", - "\n", - " for k in ks:\n", - " accs[k] /= len(test_query_cids)\n", - " \n", - " return accs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Create configs for our test data\n", - "test_kb = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_kb.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "test_queries = OmegaConf.create({\n", - " \"data_file\": os.path.join(DATA_DIR, \"tiny_example_test_queries.tsv\"),\n", - " \"max_seq_length\": 128,\n", - " \"batch_size\": 10,\n", - " \"shuffle\": False,\n", - "})\n", - "\n", - "ks = [1, 5]\n", - "\n", - "# Evaluate both models on our test data\n", - "base_accs = evaluate(base_model, test_kb, test_queries, ks)\n", - "base_accs[\"Model\"] = \"BERT Base Baseline\"\n", - "\n", - "sap_accs = evaluate(sap_model, test_kb, test_queries, ks)\n", - "sap_accs[\"Model\"] = \"BERT + SAP\"\n", - "\n", - "print(\"Top 1 and Top 5 Accuracy Comparison:\")\n", - "results_df = pd.DataFrame([base_accs, sap_accs], columns=[\"Model\", 1, 5])\n", - "results_df = results_df.style.set_properties(**{'text-align': 'left', }).set_table_styles([dict(selector='th', props=[('text-align', 'left')])])\n", - "display(results_df)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The purpose of this section was to show an example of evaluating your entity linking model. This evaluation set contains very little data, and no serious conclusions should be drawn about model performance. Top 1 accuracy should be between 0.7 and 1.0 for both models and top 5 accuracy should be between 0.8 and 1.0. When evaluating a model trained on a larger dataset, you can use a nearest neighbors index to speed up the evaluation time." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Building an Index" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To qualitatively observe the improvement we gain from the second stage pretraining, let's build two indices. One will be built with BERT base embeddings before self-alignment pretraining and one will be built with the model we just trained. Our knowledge base in this tutorial will be in the same domain and have some overlapping concepts as the training set. This data file is formatted as `ID\\tconcept`." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The `EntityLinkingDataset` class can load the data used for training the entity linking encoder as well as for building the index if the `is_index_data` flag is set to true. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def build_index(cfg, model):\n", - " # Setup index dataset loader\n", - " index_dataloader = model.setup_dataloader(cfg.index.index_ds, is_index_data=True)\n", - " \n", - " # Get index dataset embeddings\n", - " embeddings, _ = get_embeddings(model, index_dataloader)\n", - " \n", - " # Train IVFFlat index using faiss\n", - " embeddings = np.array(embeddings)\n", - " quantizer = faiss.IndexFlatL2(cfg.index.dims)\n", - " index = faiss.IndexIVFFlat(quantizer, cfg.index.dims, cfg.index.nlist)\n", - " index = faiss.index_cpu_to_all_gpus(index)\n", - " index.train(embeddings)\n", - " \n", - " # Add concept embeddings to index\n", - " for i in tqdm(range(0, embeddings.shape[0], cfg.index.index_batch_size)):\n", - " index.add(embeddings[i:i+cfg.index.index_batch_size])\n", - "\n", - " # Save index\n", - " faiss.write_index(faiss.index_gpu_to_cpu(index), cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "build_index(sap_model_cfg, sap_model.to(device))\n", - "build_index(base_model_cfg, base_model.to(device))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Entity Linking via Nearest Neighbor Search" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now it's time to query our indices! We are going to query both our index built with embeddings from BERT Base, and our index with embeddings built from the SAP BERT model we trained. Our sample query phrases will be \"*high blood sugar*\" and \"*head pain*\". \n", - "\n", - "To query our indices, we first need to get the embedding of each query from the corresponding encoder model. We can then pass these query embeddings into the faiss index which will perform a nearest neighbor search, using cosine distance to compare the query embedding with embeddings present in the index. Once we get a list of knowledge base index concept IDs most closely matching our query, all that is left to do is map the IDs to a representative string describing the concept. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def query_index(cfg, model, index, queries, id2string):\n", - " # Get query embeddings from our entity linking encoder model\n", - " query_embs = get_query_embedding(queries, model).cpu().detach().numpy()\n", - " \n", - " # Use query embedding to find closest concept embedding in knowledge base\n", - " distances, neighbors = index.search(query_embs, cfg.index.top_n)\n", - " \n", - " # Get the canonical strings corresponding to the IDs of the query's nearest neighbors in the kb \n", - " neighbor_concepts = [[id2string[concept_id] for concept_id in query_neighbor] \\\n", - " for query_neighbor in neighbors]\n", - " \n", - " # Display most similar concepts in the knowledge base. \n", - " for query_idx in range(len(queries)):\n", - " print(f\"\\nThe most similar concepts to {queries[query_idx]} are:\")\n", - " for cid, concept, dist in zip(neighbors[query_idx], neighbor_concepts[query_idx], distances[query_idx]):\n", - " print(cid, concept, 1 - dist)\n", - "\n", - " \n", - "def get_query_embedding(queries, model):\n", - " # Tokenize our queries\n", - " model_input = model.tokenizer(queries,\n", - " add_special_tokens = True,\n", - " padding = True,\n", - " truncation = True,\n", - " max_length = 512,\n", - " return_token_type_ids = True,\n", - " return_attention_mask = True)\n", - " \n", - " # Pass tokenized input into model\n", - " query_emb = model.forward(input_ids=torch.LongTensor(model_input[\"input_ids\"]).to(device),\n", - " token_type_ids=torch.LongTensor(model_input[\"token_type_ids\"]).to(device),\n", - " attention_mask=torch.LongTensor(model_input[\"attention_mask\"]).to(device))\n", - " \n", - " return query_emb" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Load indices\n", - "sap_index = faiss.read_index(sap_model_cfg.index.index_save_name)\n", - "base_index = faiss.read_index(base_model_cfg.index.index_save_name)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Map concept IDs to one canonical string\n", - "index_data = open(sap_model_cfg.index.index_ds.data_file, \"r\", encoding='utf-8-sig')\n", - "id2string = {}\n", - "\n", - "for line in index_data:\n", - " cid, concept = line.split(\"\\t\")\n", - " id2string[int(cid) - 1] = concept.strip()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "id2string" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Some sample queries\n", - "queries = [\"high blood sugar\", \"head pain\"]\n", - "\n", - "# Query BERT Base\n", - "print(\"BERT Base output before Self Alignment Pretraining:\")\n", - "query_index(base_model_cfg, base_model, base_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")\n", - "\n", - "# Query SAP BERT\n", - "print(\"SAP BERT output after Self Alignment Pretraining:\")\n", - "query_index(sap_model_cfg, sap_model, sap_index, queries, id2string)\n", - "print(\"\\n\" + \"-\" * 50 + \"\\n\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Even after only training on this tiny amount of data, the qualitative performance boost from self-alignment pretraining is visible. The baseline model links \"*high blood sugar*\" to the entity \"*6 diabetes*\" while our SAP BERT model accurately links \"*high blood sugar*\" to \"*Hyperinsulinemia*\". Similarly, \"*head pain*\" and \"*Myocardial infraction*\" are not the same concept, but \"*head pain*\" and \"*Headache*\" are." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For larger knowledge bases keeping the default embedding size might be too large and cause out of memory issues. You can apply PCA or some other dimensionality reduction method to your data to reduce its memory footprint. Code for creating a text file of all the UMLS entities in the correct format needed to build an index and creating a dictionary mapping concept ids to canonical concept strings can be found here `examples/nlp/entity_linking/data/umls_dataset_processing.py`. \n", - "\n", - "The code for extracting knowledge base concept embeddings, training and applying a PCA transformation to the embeddings, building a faiss index and querying the index from the command line is located at `examples/nlp/entity_linking/build_index.py` and `examples/nlp/entity_linking/query_index.py`. \n", - "\n", - "If you've cloned the NeMo repo, both of these steps can be run as follows on the command line from the `examples/nlp/entity_linking/` directory.\n", - "\n", - "```\n", - "python data/umls_dataset_processing.py --index\n", - "python build_index.py --restore\n", - "python query_index.py --restore\n", - "```\n", - "By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands. Intermediate steps of the index building process are saved. In the occurrence of an error, previously completed steps do not need to be rerun. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Command Recap" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here is a recap of the commands and steps to repeat this process on the full UMLS dataset. \n", - "\n", - "1) Download the UMLS dataset file `MRCONSO.RRF` from the NIH website and place it in the `examples/nlp/entity_linking/data` directory.\n", - "\n", - "2) Run the following commands from the `examples/nlp/entity_linking` directory\n", - "```\n", - "python data/umls_dataset_processing.py\n", - "python self_alignment_pretraining.py project_dir=. \n", - "python data/umls_dataset_processing.py --index\n", - "python build_index.py --restore\n", - "python query_index.py --restore\n", - "```\n", - "The model will take ~24hrs to train on two GPUs and ~48hrs to train on one GPU. By default the project directory will be \".\" but can be changed by adding the flag `--project_dir=` after each of the above commands and changing `project_dir=` in the `self_alignment_pretraining.py` command. If you change the project directory, you should also move the `MRCONOSO.RRF` file to a `data` sub directory within the one you've specified. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As mentioned in the introduction, entity linking within NVIDIA NeMo is not limited to the medical domain. The same data processing and training steps can be applied to a variety of domains and use cases. You can edit the datasets used as well as training and loss function hyperparameters within your config file to better suit your domain." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.13" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/tutorials/nlp/GLUE_Benchmark.ipynb b/tutorials/nlp/GLUE_Benchmark.ipynb deleted file mode 100644 index b77b3439b444c..0000000000000 --- a/tutorials/nlp/GLUE_Benchmark.ipynb +++ /dev/null @@ -1,566 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "GLUE_Benchmark.ipynb", - "provenance": [], - "private_outputs": true, - "collapsed_sections": [] - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "accelerator": "GPU", - "pycharm": { - "stem_cell": { - "cell_type": "raw", - "source": [], - "metadata": { - "collapsed": false - } - } - } - }, - "cells": [ - { - "cell_type": "code", - "metadata": { - "id": "o_0K1lsW1dj9", - "colab_type": "code", - "colab": {} - }, - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "BRANCH = 'main'\n!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]\n" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "pycharm": { - "name": "#%%\n" - }, - "id": "JFWG-jYCfvD7", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# If you're not using Colab, you might need to upgrade jupyter notebook to avoid the following error:\n", - "# 'ImportError: IProgress not found. Please update jupyter and ipywidgets.'\n", - "\n", - "! pip install ipywidgets\n", - "! jupyter nbextension enable --py widgetsnbextension\n", - "\n", - "# Please restart the kernel after running this cell" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "dzqD2WDFOIN-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "from nemo.collections import nlp as nemo_nlp\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "daYw_Xll2ZR9", - "colab_type": "text" - }, - "source": [ - "In this tutorial, we are going to describe how to finetune a BERT-like model based on [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) on [GLUE: A Multi-Task Benchmark and Analysis Platform for Natural Language Understanding](https://openreview.net/pdf?id=rJ4km2R5t7). \n", - "\n", - "# GLUE tasks\n", - "GLUE Benchmark includes 9 natural language understanding tasks:\n", - "\n", - "## Single-Sentence Tasks\n", - "\n", - "* CoLA - [The Corpus of Linguistic Acceptability](https://arxiv.org/abs/1805.12471) is a set of English sentences from published linguistics literature. The task is to predict whether a given sentence is grammatically correct or not.\n", - "* SST-2 - [The Stanford Sentiment Treebank](https://nlp.stanford.edu/~socherr/EMNLP2013_RNTN.pdf) consists of sentences from movie reviews and human annotations of their sentiment. The task is to predict the sentiment of a given sentence: positive or negative.\n", - "\n", - "## Similarity and Paraphrase tasks\n", - "\n", - "* MRPC - [The Microsoft Research Paraphrase Corpus](https://www.aclweb.org/anthology/I05-5002.pdf) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.\n", - "* QQP - [The Quora Question Pairs](https://www.quora.com/q/quoradata/First-Quora-Dataset-Release-Question-Pairs) dataset is a collection of question pairs from the community question-answering website Quora. The task is to determine whether a pair of questions are semantically equivalent.\n", - "* STS-B - [The Semantic Textual Similarity Benchmark](https://arxiv.org/abs/1708.00055) is a collection of sentence pairs drawn from news headlines, video, and image captions, and natural language inference data. The task is to determine how similar two sentences are.\n", - "\n", - "## Inference Tasks\n", - "\n", - "* MNLI - [The Multi-Genre Natural Language Inference Corpus](https://cims.nyu.edu/~sbowman/multinli/multinli_0.9.pdf) is a crowdsourced collection of sentence pairs with textual entailment annotations. Given a premise sentence and a hypothesis sentence, the task is to predict whether the premise entails the hypothesis (entailment), contradicts the hypothesis (contradiction), or neither (neutral). The task has the matched (in-domain) and mismatched (cross-domain) sections.\n", - "* QNLI - [The Stanford Question Answering Dataset](https://nlp.stanford.edu/pubs/rajpurkar2016squad.pdf) is a question-answering dataset consisting of question-paragraph pairs, where one of the sentences in the paragraph (drawn from Wikipedia) contains the answer to the corresponding question. The task is to determine whether the context sentence contains the answer to the question.\n", - "* RTE The Recognizing Textual Entailment (RTE) datasets come from a series of annual [textual entailment challenges](https://aclweb.org/aclwiki/Recognizing_Textual_Entailment). The task is to determine whether the second sentence is the entailment of the first one or not.\n", - "* WNLI - The Winograd Schema Challenge is a reading comprehension task in which a system must read a sentence with a pronoun and select the referent of that pronoun from a list of choices (Hector Levesque, Ernest Davis, and Leora Morgenstern. The winograd schema challenge. In Thirteenth International Conference on the Principles of Knowledge Representation and Reasoning. 2012).\n", - "\n", - "All tasks are classification tasks, except for the STS-B task which is a regression task. All classification tasks are 2-class problems, except for the MNLI task which has 3-classes.\n", - "\n", - "More details about GLUE benchmark could be found [here](https://gluebenchmark.com/)." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZnuziSwJ1yEB", - "colab_type": "text" - }, - "source": [ - "# Datasets\n", - "\n", - "**To proceed further, you need to download the GLUE data.** For example, you can download [this script](https://gist.githubusercontent.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e/raw/17b8dd0d724281ed7c3b2aeeda662b92809aadd5/download_glue_data.py) using `wget` and then execute it by running:\n", - "\n", - "`python download_glue_data.py`\n", - "\n", - "use `--tasks TASK` if datasets for only selected GLUE tasks are needed\n", - "\n", - "After running the above commands, you will have a folder `glue_data` with data folders for every GLUE task. For example, data for MRPC task would be under glue_data/MRPC.\n", - "\n", - "This tutorial and [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py) work with all GLUE tasks without any modifications. For this tutorial, we are going to use MRPC task.\n", - "\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "--wJ2891aIIE", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# supported task names: [\"cola\", \"sst-2\", \"mrpc\", \"sts-b\", \"qqp\", \"mnli\", \"qnli\", \"rte\", \"wnli\"]\n", - "TASK = 'mrpc'\n", - "DATA_DIR = 'glue_data/MRPC'\n", - "WORK_DIR = \"WORK_DIR\"\n", - "MODEL_CONFIG = 'glue_benchmark_config.yaml'" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "qB0oLE4R9EhJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "! ls -l $DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "gMWuU69pbUDe", - "colab_type": "text" - }, - "source": [ - "For each task, there are 3 files: `train.tsv, dev.tsv, and test.tsv`. Note, MNLI has 2 dev sets: matched and mismatched, evaluation on both dev sets will be done automatically." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "6UDPgadLN6SG", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# let's take a look at the training data \n", - "! head -n 5 {DATA_DIR}/train.tsv" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_whKCxfTMo6Y", - "colab_type": "text" - }, - "source": [ - "# Model configuration\n", - "\n", - "Now, let's take a closer look at the model's configuration and learn to train the model.\n", - "\n", - "GLUE model is comprised of the pretrained [BERT](https://arxiv.org/pdf/1810.04805.pdf) model followed by a Sequence Regression module (for STS-B task) or Sequence classifier module (for the rest of the tasks).\n", - "\n", - "The model is defined in a config file which declares multiple important sections. They are:\n", - "- **model**: All arguments that are related to the Model - language model, a classifier, optimizer and schedulers, datasets and any other related information\n", - "\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "T1gA8PsJ13MJ", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# download the model's configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/glue_benchmark/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print ('config file is already exists')" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "mX3KmWMvSUQw", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "print(OmegaConf.to_yaml(config))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ZCgWzNBkaQLZ", - "colab_type": "text" - }, - "source": [ - "# Model Training\n", - "## Setting up Data within the config\n", - "\n", - "Among other things, the config file contains dictionaries called **dataset**, **train_ds** and **validation_ds**. These are configurations used to setup the Dataset and DataLoaders of the corresponding config.\n", - "\n", - "We assume that both training and evaluation files are located in the same directory, and use the default names mentioned during the data download step. \n", - "So, to start model training, we simply need to specify `model.dataset.data_dir`, like we are going to do below.\n", - "\n", - "Also notice that some config lines, including `model.dataset.data_dir`, have `???` in place of paths, this means that values for these fields are required to be specified by the user.\n", - "\n", - "Let's now add the data directory path, task name and output directory for saving predictions to the config." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "LQHCJN-ZaoLp", - "colab_type": "code", - "colab": {} - }, - "source": [ - "config.model.task_name = TASK\n", - "config.model.output_dir = WORK_DIR\n", - "config.model.dataset.data_dir = DATA_DIR" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nB96-3sTc3yk", - "colab_type": "text" - }, - "source": [ - "## Building the PyTorch Lightning Trainer\n", - "\n", - "NeMo models are primarily PyTorch Lightning modules - and therefore are entirely compatible with the PyTorch Lightning ecosystem.\n", - "\n", - "Let's first instantiate a Trainer object" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "1tG4FzZ4Ui60", - "colab_type": "code", - "colab": {} - }, - "source": [ - "print(\"Trainer config - \\n\")\n", - "print(OmegaConf.to_yaml(config.trainer))" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "knF6QeQQdMrH", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# lets modify some trainer configs\n", - "# checks if we have GPU available and uses it\n", - "accelerator = 'gpu' if torch.cuda.is_available() else 'cpu'\n", - "config.trainer.devices = 1\n", - "config.trainer.accelerator = accelerator\n", - "\n", - "config.trainer.precision = 16 if torch.cuda.is_available() else 32\n", - "\n", - "# for mixed precision training, uncomment the line below (precision should be set to 16 and amp_level to O1):\n", - "# config.trainer.amp_level = O1\n", - "\n", - "# remove distributed training flags\n", - "config.trainer.strategy = 'auto'\n", - "\n", - "# setup max number of steps to reduce training time for demonstration purposes of this tutorial\n", - "config.trainer.max_steps = 128\n", - "\n", - "trainer = pl.Trainer(**config.trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8IlEMdVxdr6p", - "colab_type": "text" - }, - "source": [ - "## Setting up a NeMo Experiment\n", - "\n", - "NeMo has an experiment manager that handles logging and checkpointing for us, so let's use it:" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "8uztqGAmdrYt", - "colab_type": "code", - "colab": {} - }, - "source": [ - "exp_dir = exp_manager(trainer, config.get(\"exp_manager\", None))\n", - "\n", - "# the exp_dir provides a path to the current experiment for easy access\n", - "exp_dir = str(exp_dir)\n", - "exp_dir" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8tjLhUvL_o7_", - "colab_type": "text" - }, - "source": [ - "Before initializing the model, we might want to modify some of the model configs. For example, we might want to modify the pretrained BERT model and use [Megatron-LM BERT](https://arxiv.org/abs/1909.08053) or [AlBERT model](https://arxiv.org/abs/1909.11942):" - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "Xeuc2i7Y_nP5", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# get the list of supported BERT-like models, for the complete list of HugginFace models, see https://huggingface.co/models\n", - "print(nemo_nlp.modules.get_pretrained_lm_models_list(include_external=True))\n", - "\n", - "# specify BERT-like model, you want to use, for example, \"megatron-bert-345m-uncased\" or 'bert-base-uncased'\n", - "PRETRAINED_BERT_MODEL = \"albert-base-v1\"" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "metadata": { - "id": "RK2xglXyAUOO", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# add the specified above model parameters to the config\n", - "config.model.language_model.pretrained_model_name = PRETRAINED_BERT_MODEL" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fzNZNAVRjDD-", - "colab_type": "text" - }, - "source": [ - "Now, we are ready to initialize our model. During the model initialization call, the dataset and data loaders we'll be prepared for training and evaluation.\n", - "Also, the pretrained BERT model will be downloaded, note it can take up to a few minutes depending on the size of the chosen BERT model." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "NgsGLydWo-6-", - "colab_type": "code", - "colab": {} - }, - "source": [ - "model = nemo_nlp.models.GLUEModel(cfg=config.model, trainer=trainer)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kQ592Tx4pzyB", - "colab_type": "text" - }, - "source": [ - "## Monitoring training progress\n", - "Optionally, you can create a Tensorboard visualization to monitor training progress." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "mTJr16_pp0aS", - "colab_type": "code", - "colab": {} - }, - "source": [ - "try:\n", - " from google import colab\n", - " COLAB_ENV = True\n", - "except (ImportError, ModuleNotFoundError):\n", - " COLAB_ENV = False\n", - "\n", - "# Load the TensorBoard notebook extension\n", - "if COLAB_ENV:\n", - " %load_ext tensorboard\n", - " %tensorboard --logdir {exp_dir}\n", - "else:\n", - " print(\"To use tensorboard, please use this notebook in a Google Colab environment.\")" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "CFgAlaIdndjW", - "colab_type": "text" - }, - "source": [ - "Note, it’s recommended to finetune the model on each task separately. Also, based on [GLUE Benchmark FAQ#12](https://gluebenchmark.com/faq), there are might be some differences in dev/test distributions for QQP task and in train/dev for WNLI task." - ] - }, - { - "cell_type": "code", - "metadata": { - "id": "hUvnSpyjp0Dh", - "colab_type": "code", - "colab": {} - }, - "source": [ - "# start model training\n", - "trainer.fit(model)" - ], - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ref1qSonGNhP", - "colab_type": "text" - }, - "source": [ - "## Training Script\n", - "\n", - "If you have NeMo installed locally, you can also train the model with [examples/nlp/glue_benchmark/glue_benchmark.py](https://github.com/NVIDIA/NeMo/blob/stable/examples/nlp/glue_benchmark/glue_benchmark.py).\n", - "\n", - "To run training script, use:\n", - "\n", - "`python glue_benchmark.py \\\n", - " model.dataset.data_dir=PATH_TO_DATA_DIR \\\n", - " model.task_name=TASK`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KVPFofXaoKNE", - "colab_type": "text" - }, - "source": [ - "Average results after 3 runs:\n", - "\n", - "| Task | Metric | ALBERT-large | ALBERT-xlarge | Megatron-345m | BERT base paper | BERT large paper |\n", - "|-------|--------------------------|--------------|---------------|---------------|-----------------|------------------|\n", - "| CoLA | Matthew's correlation | 54.94 | 61.72 | 64.56 | 52.1 | 60.5 |\n", - "| SST-2 | Accuracy | 92.74 | 91.86 | 95.87 | 93.5 | 94.9 |\n", - "| MRPC | F1/Accuracy | 92.05/88.97 | 91.87/88.61 | 92.36/89.46 | 88.9/- | 89.3/- |\n", - "| STS-B | Person/Spearman corr. | 90.41/90.21 | 90.07/90.10 | 91.51/91.61 | -/85.8 | -/86.5 |\n", - "| QQP | F1/Accuracy | 88.26/91.26 | 88.80/91.65 | 89.18/91.91 | 71.2/- | 72.1/- |\n", - "| MNLI | Matched /Mismatched acc. | 86.69/86.81 | 88.66/88.73 | 89.86/89.81 | 84.6/83.4 | 86.7/85.9 |\n", - "| QNLI | Accuracy | 92.68 | 93.66 | 94.33 | 90.5 | 92.7 |\n", - "| RTE | Accuracy | 80.87 | 82.86 | 83.39 | 66.4 | 70.1 |\n", - "\n", - "WNLI task was excluded from the experiments due to the problematic WNLI set.\n", - "The dev sets were used for evaluation for ALBERT and Megatron models, and the test sets results for [the BERT paper](https://arxiv.org/abs/1810.04805).\n", - "\n", - "Hyperparameters used to get the results from the above table, could be found in the table below. Some tasks could be further finetuned to improve performance numbers, the tables are for a baseline reference only.\n", - "Each cell in the table represents the following parameters:\n", - "Number of GPUs used/ Batch Size/ Learning Rate/ Number of Epochs. For not specified parameters, please refer to the default parameters in the training script.\n", - "\n", - "| Task | ALBERT-large | ALBERT-xlarge | Megatron-345m |\n", - "|-------|--------------|---------------|---------------|\n", - "| CoLA | 1 / 32 / 1e-5 / 3 | 1 / 32 / 1e-5 / 10 | 4 / 16 / 2e-5 / 12 |\n", - "| SST-2 | 4 / 16 / 2e-5 / 5 | 4 / 16 / 2e-5 /12 | 4 / 16 / 2e-5 / 12 |\n", - "| MRPC | 1 / 32 / 1e-5 / 5 | 1 / 16 / 2e-5 / 5 | 1 / 16 / 2e-5 / 10 |\n", - "| STS-B | 1 / 16 / 2e-5 / 5 | 1 / 16 / 4e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n", - "| QQP | 1 / 16 / 2e-5 / 5 | 4 / 16 / 1e-5 / 12 | 4 / 16 / 1e-5 / 12 |\n", - "| MNLI | 4 / 64 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | 4 / 32 / 1e-5 / 5 | \n", - "| QNLI | 4 / 16 / 1e-5 / 5 | 4 / 16 / 1e-5 / 5 | 4 / 16 / 2e-5 / 5 | \n", - "| RTE | 1 / 16 / 1e-5 / 5 | 1 / 16 / 1e-5 / 12 | 4 / 16 / 3e-5 / 12 |\n" - ] - } - ] -} diff --git a/tutorials/nlp/MegatronBert_export.ipynb b/tutorials/nlp/MegatronBert_export.ipynb deleted file mode 100644 index c19c07b670051..0000000000000 --- a/tutorials/nlp/MegatronBert_export.ipynb +++ /dev/null @@ -1,280 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "id": "8046e96a", - "metadata": {}, - "outputs": [], - "source": [ - "BRANCH='main'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "38bfe8ea", - "metadata": {}, - "outputs": [], - "source": [ - "\"\"\"\n", - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run this cell to set up dependencies.\n", - "\"\"\"\n", - "# If you're using Google Colab and not running locally, run this cell\n", - "\n", - "# install NeMo\n", - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98c00a93", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import wget \n", - "import torch\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf" - ] - }, - { - "cell_type": "markdown", - "id": "e9fb1a66", - "metadata": {}, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "# Task Description\n", - "In this tutorial, we are going to describe how to export NeMo NLP models with BERT based models as the pre-trained model." - ] - }, - { - "cell_type": "markdown", - "id": "dd0fb016", - "metadata": {}, - "source": [ - "## Convert the Megatron-LM Weights to Nemo file\n", - "\n", - "If you prefer to use the Huggingface BERT models, please skip this section and refer to `Setting up a NeMo Experiment` section to load a model from `nemo_nlp.modules.get_pretrained_lm_models_list()`\n", - "\n", - "NeMo Megatron BERT can [load from a pretrained model](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/core/core.html?highlight=nemo%20file#restore) using `.nemo` file. We can convert the Megatron-LM checkpoint to the `.nemo` file. Let's first download the pretrained model weights and vocabulary file." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e451f219", - "metadata": {}, - "outputs": [], - "source": [ - "from nemo.collections.nlp.modules.common.megatron.megatron_utils import MEGATRON_CONFIG_MAP\n", - "import pathlib\n", - "\n", - "PRETRAINED_BERT_MODEL = \"megatron-bert-345m-uncased\" # specify BERT-like model from MEGATRON_CONFIG_MAP.keys()\n", - "nemo_out_path = \"qa_pretrained.nemo\" # the nemo output file name\n", - "\n", - "checkpoint_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['checkpoint']\n", - "vocab_url = MEGATRON_CONFIG_MAP[PRETRAINED_BERT_MODEL]['vocab']\n", - "checkpoint_filename = pathlib.Path(checkpoint_url).name\n", - "vocab_filename = pathlib.Path(vocab_url).name\n", - "if not pathlib.Path(checkpoint_filename).exists():\n", - " print('downloading from checkpoint url', checkpoint_url)\n", - " !wget $checkpoint_url\n", - "if not pathlib.Path(vocab_filename).exists():\n", - " print('downloading from vocab url', vocab_url)\n", - " !wget $vocab_url" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7586b5c0", - "metadata": {}, - "outputs": [], - "source": [ - "WORK_DIR = \"WORK_DIR\"\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "\n", - "# Prepare the model parameters \n", - "# download the model's configuration file \n", - "config_dir = WORK_DIR + '/configs/'\n", - "MODEL_CONFIG = \"megatron_bert_config.yaml\"\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + MODEL_CONFIG):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/conf/' + MODEL_CONFIG, config_dir)\n", - "else:\n", - " print ('config file is already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e0dd3124", - "metadata": {}, - "outputs": [], - "source": [ - "# this line will print the entire config of the model\n", - "config_path = f'{WORK_DIR}/configs/{MODEL_CONFIG}'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "\n", - "config.model.megatron_legacy = True # set to true if you trained the NLP model on NeMo < 1.5.0\n", - "config.model.bias_gelu_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export \n", - "config.model.masked_softmax_fusion = False # set to true if you want the MegatronLM to NeMo conversion for training; and set to false to use the converted model at time of export\n", - "\n", - "config.model.num_layers = 24\n", - "config.model.hidden_size = 1024\n", - "config.model.ffn_hidden_size = 4096\n", - "config.model.num_attention_heads = 16\n", - "config.model.tokenizer.vocab_file = vocab_filename\n", - "config.model.tokenizer.type = 'BertWordPieceLowerCase' # change this to BertWordPieceCase if you are using a cased pretrained model\n", - "config.model.tensor_model_parallel_size = 1\n", - "config.model.data.data_prefix = ''\n", - "config.model.max_position_embeddings = 512\n", - "config.model.data.seq_length = 512\n", - "config.cfg = {}\n", - "config.cfg.cfg = config.model\n", - "with open('hparams.yaml', 'w') as f:\n", - " f.write(OmegaConf.to_yaml(config.cfg))\n", - "if(config.model.megatron_legacy):\n", - " checkpoint_filename = \"model_optim_rng_ca.pt\" #provide path to the pretrained pt file you used during training on NeMo < 1.5.0, for NeMo >= 1.5.0\n", - "print(checkpoint_filename)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47dca6de", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "PWD = os.getcwd()\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/language_modeling/megatron_lm_ckpt_to_nemo.py')\n", - "!python -m torch.distributed.run --nproc_per_node=1 megatron_lm_ckpt_to_nemo.py --checkpoint_folder=$PWD --checkpoint_name=$checkpoint_filename --hparams_file=$PWD/hparams.yaml --nemo_file_path=$PWD/$nemo_out_path --model_type=bert --tensor_model_parallel_size=1" - ] - }, - { - "cell_type": "markdown", - "id": "1ae8d31b", - "metadata": {}, - "source": [ - "# Legacy NLP Bert based model conversion\n", - "\n", - "Step 1: Convert legacy nemo checkpoint to a checkpoint which is currently supported by nemo\n", - "\n", - "Step 2: Use the converted model from step 1 to export the nemo file to the required format" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "86639a3d", - "metadata": {}, - "outputs": [], - "source": [ - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/nemo_legacy_import/nlp_checkpoint_port.py')\n", - "wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/scripts/export.py')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "48820d57", - "metadata": {}, - "outputs": [], - "source": [ - "legacy_nemo_file_path = \"/NeMo/megatron_multiqa.nemo\" #path to you model trained on NeMo < 1.5\n", - "nemo_converted_out_path = \"converted_megatron_multiqa.nemo\"\n", - "megatron_absolute_language_model_path = \"/NeMo/tutorials/nlp/qa_pretrained.nemo\" # Give the absolute path of the model you obtained using megatron_lm_ckpt_to_nemo\n", - "onnx_export_out_path = \"onnx_megatron_multiqa.onnx\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7191e0cb", - "metadata": {}, - "outputs": [], - "source": [ - "os.system(f\"python nlp_checkpoint_port.py {legacy_nemo_file_path} {nemo_converted_out_path} --megatron-legacy=True --megatron-checkpoint {megatron_absolute_language_model_path}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ccc720ef", - "metadata": {}, - "outputs": [], - "source": [ - "os.system(f\"python export.py {nemo_converted_out_path} {onnx_export_out_path} --autocast --runtime-check\")" - ] - }, - { - "cell_type": "markdown", - "id": "f10461f2", - "metadata": {}, - "source": [ - "# Convert a NLP model with BERT based pre-trained model trained on NeMo >= 1.5.0\n", - "\n", - "For models trained on NeMo >= 1.5.0, you just run the export script and skip the legacy conversion part" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0514ab37", - "metadata": {}, - "outputs": [], - "source": [ - "nemo_file_path = \"\"\n", - "onnx_export_out_path = " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1d6b5db4", - "metadata": {}, - "outputs": [], - "source": [ - "python export.py $nemo_converted_out_path $onnx_export_out_path --autocast --runtime-check" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/tutorials/nlp/Question_Answering.ipynb b/tutorials/nlp/Question_Answering.ipynb deleted file mode 100644 index 054928245d9d7..0000000000000 --- a/tutorials/nlp/Question_Answering.ipynb +++ /dev/null @@ -1,1163 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "tiIOhb7iVC3J" - }, - "source": [ - "# Overview" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PucJwfbhVC3L" - }, - "source": [ - "### Deprecation Notice\n", - "\n", - "This tutorial is deprecated as of r1.23.0 and will be removed in the next release.\n", - "\n", - "---\n", - "\n", - "This tutorial will demonstrate how to train, evaluate, and test three types of models for Question-Answering -\n", - "1. BERT-like models for Extractive Question-Answering\n", - "2. Sequence-to-Sequence (S2S) models for Generative Question-Answering (ex. T5/BART-like)\n", - "3. GPT-like models for Generative Question-Answering\n", - "\n", - "## Task Description\n", - "\n", - "- Given a context and a natural language query, we want to generate an answer for the query\n", - "- Depending on how the answer is generated, the task can be broadly divided into two types:\n", - " 1. Extractive Question Answering\n", - " 2. Generative Question Answering\n", - "\n", - "\n", - "### Extractive Question-Answering with BERT-like models\n", - "\n", - "Given a question and a context, both in natural language, predict the span within the context with a start and end position which indicates the answer to the question.\n", - "For every word in our training dataset we’re going to predict:\n", - "- likelihood this word is the start of the span \n", - "- likelihood this word is the end of the span\n", - "\n", - "We are using a BERT encoder with 2 span prediction heads for predicting start and end position of the answer. The span predictions are token classifiers consisting of a single linear layer.\n", - "\n", - "### Generative Question-Answering with S2S and GPT-like models\n", - "\n", - "Given a question and a context, both in natural language, generate an answer for the question. Unlike the BERT-like models, there is no constraint that the answer should be a span within the context." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IpX0w2PtVC3M" - }, - "source": [ - "# Installing NeMo" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "72XWYFQYVC3M" - }, - "source": [ - "You can run either this notebook locally (if you have all the dependencies and a GPU) or on Google Colab.\n", - "\n", - "Instructions for setting up Colab are as follows:\n", - "1. Open a new Python 3 notebook.\n", - "2. Import this notebook from GitHub (File -> Upload Notebook -> \"GITHUB\" tab -> copy/paste GitHub URL)\n", - "3. Connect to an instance with a GPU (Runtime -> Change runtime type -> select \"GPU\" for hardware accelerator)\n", - "4. Run the cell below to set up dependencies." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_xQBtr0KVC3M" - }, - "outputs": [], - "source": [ - "BRANCH = 'main'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9R1D6W58VC3N" - }, - "outputs": [], - "source": [ - "!python -m pip install git+https://github.com/NVIDIA/NeMo.git@$BRANCH#egg=nemo_toolkit[nlp]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fof5-57iVC3N" - }, - "source": [ - "# Imports and constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KqKD-wReVC3O" - }, - "outputs": [], - "source": [ - "import os\n", - "import wget\n", - "import gc\n", - "\n", - "import pytorch_lightning as pl\n", - "from omegaconf import OmegaConf\n", - "\n", - "from nemo.collections.nlp.models.question_answering.qa_bert_model import BERTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_gpt_model import GPTQAModel\n", - "from nemo.collections.nlp.models.question_answering.qa_s2s_model import S2SQAModel\n", - "from nemo.utils.exp_manager import exp_manager\n", - "\n", - "pl.seed_everything(42)\n", - "gc.disable()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xhPr9Jf_VC3O" - }, - "outputs": [], - "source": [ - "# set the following paths\n", - "DATA_DIR = \"data_dir\" # directory for storing datasets\n", - "WORK_DIR = \"work_dir\" # directory for storing trained models, logs, additionally downloaded scripts\n", - "\n", - "os.makedirs(DATA_DIR, exist_ok=True)\n", - "os.makedirs(WORK_DIR, exist_ok=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dWymW8e0VC3O" - }, - "source": [ - "# Configuration" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0YhKTkuXVC3P" - }, - "source": [ - "The model is defined in a config file which declares multiple important sections:\n", - "- **model**: All arguments that will relate to the Model - language model, span prediction, optimizer and schedulers, datasets and any other related information\n", - "- **trainer**: Any argument to be passed to PyTorch Lightning\n", - "- **exp_manager**: All arguments used for setting up the experiment manager - target directory, name, logger information\n", - "\n", - "We will download the default config file provided at `NeMo/examples/nlp/question_answering/conf/qa_conf.yaml` and edit necessary values for training different models" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WOIWJqQ0VC3P" - }, - "outputs": [], - "source": [ - "# download the model's default configuration file \n", - "config_dir = WORK_DIR + '/conf/'\n", - "os.makedirs(config_dir, exist_ok=True)\n", - "if not os.path.exists(config_dir + \"qa_conf.yaml\"):\n", - " print('Downloading config file...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/conf/qa_conf.yaml', config_dir)\n", - "else:\n", - " print ('config file already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cvD-gv-FVC3P" - }, - "outputs": [], - "source": [ - "# this will print the entire default config of the model\n", - "config_path = f'{WORK_DIR}/conf/qa_conf.yaml'\n", - "print(config_path)\n", - "config = OmegaConf.load(config_path)\n", - "print(\"Default Config - \\n\")\n", - "print(OmegaConf.to_yaml(config))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "E08e-ItPVC3P" - }, - "source": [ - "# Training and testing models on SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xn022MsKVC3Q" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "c356CGL1VC3Q" - }, - "source": [ - "For this example, we are going to download the [SQuAD](https://rajpurkar.github.io/SQuAD-explorer/) dataset to showcase how to do training and inference. There are two datasets, SQuAD1.0 and SQuAD2.0. SQuAD 1.1, the previous version of the SQuAD dataset, contains 100,000+ question-answer pairs on 500+ articles. SQuAD2.0 dataset combines the 100,000 questions in SQuAD1.1 with over 50,000 unanswerable questions written adversarially by crowdworkers to look similar to answerable ones. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Gaju1h_bVC3Q" - }, - "source": [ - "To download both datasets, we use `NeMo/examples/nlp/question_answering/get_squad.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nb840_bZVC3Q" - }, - "outputs": [], - "source": [ - "# download get_squad.py script to download and preprocess the SQuAD data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/get_squad.py'):\n", - " print('Downloading get_squad.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/get_squad.py', WORK_DIR)\n", - "else:\n", - " print ('get_squad.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "sOgY0tRzVC3Q" - }, - "outputs": [], - "source": [ - "# download and preprocess the data\n", - "!python $WORK_DIR/get_squad.py --destDir $DATA_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nprGkyvRVC3Q" - }, - "source": [ - "After execution of the above cell, your data folder will contain a subfolder \"squad\" the following four files for training and evaluation\n", - "\n", - "```\n", - "squad \n", - "│\n", - "└───v1.1\n", - "│ │ - train-v1.1.json\n", - "│ │ - dev-v1.1.json\n", - "│\n", - "└───v2.0\n", - " │ - train-v2.0.json\n", - " │ - dev-v2.0.json\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GX0KWQXKVC3Q" - }, - "outputs": [], - "source": [ - "!ls -LR {DATA_DIR}/squad" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RFVcvseOVC3R" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Grb0EeRqVC3R" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache file if not already present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# indicates whether the dataset is of extractive nature or not\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "config.model.dataset.check_if_answer_in_context = True\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/squad/v2.0/train-v2.0.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/squad/v2.0/dev-v2.0.json\"\n", - "\n", - "# set batch sizes for train, validation, and test datasets\n", - "config.model.train_ds.batch_size = 8\n", - "config.model.validation_ds.batch_size = 8\n", - "config.model.test_ds.batch_size = 8\n", - "\n", - "# set number of samples to be used from dataset. setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rFWF41VwVC3R" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "42yif-GIVC3R" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use [0] this tutorial does not support multiple GPUs. If needed please use NeMo/examples/nlp/question_answering/question_answering.py\n", - "config.trainer.accelerator = \"gpu\"\n", - "config.trainer.strategy=\"auto\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EDQzMBlbVC3R" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pxY4rnJBVC3R" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-SQuAD2\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N2_C8reNVC3R" - }, - "source": [ - "## BERT model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4Mf-_rioVC3R" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gtlGHzVJVC3R" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"bert-base-uncased\"\n", - "config.model.tokenizer.tokenizer_name = \"bert-base-uncased\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bert_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 3e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RaM7fe8rVC3R" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ukLzGmy9VC3R" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = BERTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qZIA69rlVC3R" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "asutB9ZzVC3R" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "n5AIv0SEVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7k5kD6tvVC3S" - }, - "outputs": [], - "source": [ - "model = BERTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zyh0SNiyVC3S" - }, - "source": [ - "## S2S BART model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sy9IYgVYVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PKNmHKV5VC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5\n", - "\n", - "#remove vocab_file from gpt model\n", - "config.model.tokenizer.vocab_file = None" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S_0glS4yVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "8jWyHY1oVC3S" - }, - "outputs": [], - "source": [ - "# uncomment below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xg-j39b4VC3S" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ocsf0EBDVC3S" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Vs3pl0VMVC3S" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NoW6_GO_VC3S" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "a7-iInbPVC3S" - }, - "source": [ - "## GPT2 model for SQuAD v2.0" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VaIC0l2aVC3S" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5j6SVk6fVC3S" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"gpt2\"\n", - "config.model.tokenizer.tokenizer_name = \"gpt2\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/gpt2_squad_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 1e-4" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rWhhEuvzVC3S" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vBtP3ukDVC3S" - }, - "outputs": [], - "source": [ - "# uncomment below line and run if you get an error while initializing tokenizer on Colab (reference: https://github.com/huggingface/transformers/issues/8690)\n", - "# !rm -r /root/.cache/huggingface/\n", - "\n", - "trainer = pl.Trainer(**config.trainer)\n", - "model = GPTQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "EApFrJh8VC3T" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zYo2JDdOVC3T" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "6aNEt06fVC3T" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ioLT4DVbVC3T" - }, - "outputs": [], - "source": [ - "model = GPTQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hTWOlD9AVC3T" - }, - "source": [ - "# Training and testing models on MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lZWsMwnGVC3T" - }, - "source": [ - "## Dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "pRUAwgAbVC3T" - }, - "source": [ - "### Downloading the data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qz3DO9JGVC3T" - }, - "source": [ - "MS-MARCO(Microsoft Machine Reading Comprehension) is a large scale dataset focused on machine reading comprehension, question answering, and passage ranking. MS-MARCO consists of 1,010,916 queries generated from real, anonymized Bing user queries. The contexts are extracted from real web documents and the answers are generated by humans.\n", - "\n", - "Please agree to the Terms of Use at https://microsoft.github.io/msmarco/ before downloading the data\n", - "\n", - "The data can be downloaded at:\n", - "- https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz\n", - "- https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Fm5MzZ91inP5" - }, - "outputs": [], - "source": [ - "os.makedirs(os.path.join(DATA_DIR, \"msmarco\"), exist_ok=True)\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/train_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/train_v2.1.json.gz\n", - "\n", - "!wget https://msmarco.blob.core.windows.net/msmarco/dev_v2.1.json.gz -P $DATA_DIR/msmarco\n", - "!gunzip $DATA_DIR/msmarco/dev_v2.1.json.gz" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nDmFHzBtVC3T" - }, - "source": [ - "### Converting to SQuAD format\n", - "\n", - "The script for converting MS-MARCO dataset to SQuAD can be found at `NeMo/examples/nlp/question_answering/convert_msmarco_to_squad_format.py`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tJtNIzZQVC3T" - }, - "outputs": [], - "source": [ - "# download convert_msmarco_to_squad_format.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/convert_msmarco_to_squad_format.py'):\n", - " print('Downloading convert_msmarco_to_squad_format.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/question_answering/convert_msmarco_to_squad_format.py', WORK_DIR)\n", - "else:\n", - " print ('convert_msmarco_to_squad_format.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Io_esJPSuBcW" - }, - "outputs": [], - "source": [ - "# we will exclude examples from MS-MARCO dataset that do not have a wellFormedAnswer using a utility script\n", - "# download remove_ms_marco_samples_without_wellFormedAnswers.py script to format the MS-MARCO data\n", - "os.makedirs(WORK_DIR, exist_ok=True)\n", - "if not os.path.exists(WORK_DIR + '/remove_ms_marco_samples_without_wellFormedAnswers.py'):\n", - " print('Downloading remove_ms_marco_samples_without_wellFormedAnswers.py...')\n", - " wget.download(f'https://raw.githubusercontent.com/NVIDIA/NeMo/{BRANCH}/examples/nlp/dialogue/remove_ms_marco_samples_without_wellFormedAnswers.py', WORK_DIR)\n", - "else:\n", - " print ('remove_ms_marco_samples_without_wellFormedAnswers.py already exists')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cs_CXkfXuYVQ" - }, - "outputs": [], - "source": [ - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/train_v2.1.json\n", - "!python $WORK_DIR/remove_ms_marco_samples_without_wellFormedAnswers.py --filename $DATA_DIR/msmarco/dev_v2.1.json" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AUAKI086VC3T" - }, - "outputs": [], - "source": [ - "!(python $WORK_DIR/convert_msmarco_to_squad_format.py \\\n", - " --msmarco_train_input_filepath=$DATA_DIR/msmarco/train_v2.1.json \\\n", - " --msmarco_dev_input_filepath=$DATA_DIR/msmarco/dev_v2.1.json \\\n", - " --converted_train_save_path=$DATA_DIR/msmarco/msmarco-squad-format-train-v2.1.json \\\n", - " --converted_dev_save_path=$DATA_DIR/msmarco/msmarco-squad-format-dev-v2.1.json \\\n", - " --exclude_negative_samples=False \\\n", - " --keep_only_relevant_passages=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AeHesaFcVC3T" - }, - "source": [ - "## Set dataset config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rhx-_1X3VC3T" - }, - "outputs": [], - "source": [ - "# if True, model will load features from cache if file is present, or\n", - "# create features and dump to cache file if not already present\n", - "config.model.dataset.use_cache = False\n", - "\n", - "# indicates whether the dataset has unanswerable questions\n", - "config.model.dataset.version_2_with_negative = True\n", - "\n", - "# if True, context spans/chunks that do not contain answer are treated as unanswerable \n", - "# should be False for MS-MARCO dataset, or other datasets of generative nature\n", - "config.model.dataset.check_if_answer_in_context = False\n", - "\n", - "# set file paths for train, validation, and test datasets\n", - "config.model.train_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-train-v2.1.json\"\n", - "config.model.validation_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "config.model.test_ds.file = f\"{DATA_DIR}/msmarco/msmarco-squad-format-dev-v2.1.json\"\n", - "\n", - "# set batch sizes for train, validation, and test datasets\n", - "config.model.train_ds.batch_size = 16\n", - "config.model.validation_ds.batch_size = 16\n", - "config.model.test_ds.batch_size = 16\n", - "\n", - "# set number of samples to be used from dataset. setting to -1 uses entire dataset\n", - "config.model.train_ds.num_samples = 5000\n", - "config.model.validation_ds.num_samples = 1000\n", - "config.model.test_ds.num_samples = 100" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "X43k_EeqVC3T" - }, - "source": [ - "## Set trainer config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HavpkQLPVC3U" - }, - "outputs": [], - "source": [ - "config.trainer.max_epochs = 1\n", - "config.trainer.max_steps = -1 # takes precedence over max_epochs\n", - "config.trainer.precision = 16\n", - "config.trainer.devices = [0] # 0 for CPU, or list of the GPUs to use e.g. [0, 1] or [0]\n", - "config.trainer.accelerator = \"gpu\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R-_FIZE2VC3U" - }, - "source": [ - "## Set experiment manager config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "10TT3okiVC3U" - }, - "outputs": [], - "source": [ - "config.exp_manager.exp_dir = WORK_DIR\n", - "config.exp_manager.name = \"QA-MSMARCO\"\n", - "config.exp_manager.create_wandb_logger=False" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "MKIq6YT-VC3U" - }, - "source": [ - "## S2S BART model for MS-MARCO" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvf-QpYLVC3U" - }, - "source": [ - "### Set model config values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DDVZ1a5fVC3U" - }, - "outputs": [], - "source": [ - "# set language model and tokenizer to be used\n", - "# tokenizer is derived from model if a tokenizer name is not provided\n", - "config.model.language_model.pretrained_model_name = \"facebook/bart-base\"\n", - "config.model.tokenizer.tokenizer_name = \"facebook/bart-base\"\n", - "\n", - "# path where model will be saved\n", - "config.model.nemo_path = f\"{WORK_DIR}/checkpoints/bart_msmarco_v2_0.nemo\"\n", - "\n", - "config.exp_manager.create_checkpoint_callback = True\n", - "\n", - "config.model.optim.lr = 5e-5" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3N75cdLRVC3U" - }, - "source": [ - "### Create trainer and initialize model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Bv9UMkfxVC3U" - }, - "outputs": [], - "source": [ - "trainer = pl.Trainer(**config.trainer)\n", - "model = S2SQAModel(config.model, trainer=trainer)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "BhVuV9sWVC3U" - }, - "source": [ - "### Train, test, and save the model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1JeaJ_OgVC3U" - }, - "outputs": [], - "source": [ - "trainer.fit(model)\n", - "trainer.test(model)\n", - "\n", - "model.save_to(config.model.nemo_path)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "yj0dGexaVC3U" - }, - "source": [ - "### Load the saved model and run inference" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "l1elN-WDVC3U" - }, - "outputs": [], - "source": [ - "model = S2SQAModel.restore_from(config.model.nemo_path)\n", - "\n", - "eval_device = [config.trainer.devices[0]] if isinstance(config.trainer.devices, list) else 1\n", - "model.trainer = pl.Trainer(\n", - " devices=eval_device,\n", - " accelerator=config.trainer.accelerator,\n", - " precision=16,\n", - " logger=False,\n", - ")\n", - "\n", - "config.exp_manager.create_checkpoint_callback = False\n", - "exp_dir = exp_manager(model.trainer, config.exp_manager)\n", - "output_nbest_file = os.path.join(exp_dir, \"output_nbest_file.json\")\n", - "output_prediction_file = os.path.join(exp_dir, \"output_prediction_file.json\")\n", - "\n", - "all_preds, all_nbest = model.inference(\n", - " config.model.test_ds.file,\n", - " output_prediction_file=output_prediction_file,\n", - " output_nbest_file=output_nbest_file,\n", - " num_samples=10, # setting to -1 will use all samples for inference\n", - ")\n", - "\n", - "for question_id in all_preds:\n", - " print(all_preds[question_id])" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "collapsed_sections": [], - "name": "Question_Answering.ipynb", - "provenance": [] - }, - "gpuClass": "standard", - "kernelspec": { - "display_name": "Python 3.8.0 ('test_ptl_1.7')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.0" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "e987a19b1bc60996a600adb5d563aa4a4c022e7b31abb2e65c324714934e8ea9" - } - } - }, - "nbformat": 4, - "nbformat_minor": 0 -} diff --git a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb b/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb deleted file mode 100644 index 71c7ca5051443..0000000000000 --- a/tutorials/nlp/SpellMapper_English_ASR_Customization.ipynb +++ /dev/null @@ -1,1412 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "PiRuohn_FQco" - }, - "source": [ - "# Overview\n", - "This tutorial demonstrates how to run inference with [SpellMapper](https://arxiv.org/abs/2306.02317) - a model for Spellchecking ASR (Automatic Speech Recognition) Customization.\n", - "\n", - "Estimated time: 10-15 min.\n", - "\n", - "SpellMapper is a non-autoregressive (NAR) model based on transformer architecture ([BERT](https://arxiv.org/pdf/1810.04805.pdf) with multiple separators).\n", - "It gets as input a single ASR hypothesis (text) and a **custom vocabulary** and predicts which fragments in the ASR hypothesis should be replaced by which custom words/phrases if any.\n", - "\n", - "This model is an alternative to word boosting/shallow fusion approaches:\n", - " - does not require retraining ASR model;\n", - " - does not require beam-search/language model(LM);\n", - " - can be applied on top of any English ASR model output;" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qm5wmxVEGXgH" - }, - "source": [ - "## What is custom vocabulary?\n", - "**Custom vocabulary** is a list of words/phrases that are important for a particular user. For example, user's contact names, playlist, selected terminology and so on. The size of the custom vocabulary can vary from several hundreds to **several thousand entries** - but this is not an equivalent to ngram language model.\n", - "\n", - "![Scope of customization with user vocabulary](images/spellmapper_customization_vocabulary.png)\n", - "\n", - "Note that unlike traditional spellchecking approaches, which aim to correct known words using language models, the goal of contextual spelling correction is to correct highly specific user terms, most of which can be 1) out-of-vocabulary (OOV) words, 2) spelling variations (e.g., \"John Koehn\", \"Jon Cohen\") and language models cannot help much with that." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D5_XwuXDOKho" - }, - "source": [ - "## Tutorial Plan\n", - "\n", - "1. Create a sample custom vocabulary using some medical terminology.\n", - "2. Study what customization does - a detailed analysis of a small example.\n", - "3. Run a bigger example:\n", - " * Create sample ASR results by running TTS (text-to-speech synthesis) + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n", - "\n", - "TL;DR We reduce WER from `14.3%` to `11.4%` by correcting medical terms, e.g.\n", - "* `puramesin` => `puromycin`\n", - "* `parromsin` => `puromycin`\n", - "* `and hydrod` => `anhydride`\n", - "* `lesh night and` => `lesch-nyhan`\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "agz8B2CxXBBG" - }, - "source": [ - "# Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "koRPpYISNPuH" - }, - "source": [ - "## Installing NeMo" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HCnnz3cgVc4Q" - }, - "outputs": [], - "source": [ - "# Install NeMo library. If you are running locally (rather than on Google Colab), comment out the below lines\n", - "# and instead follow the instructions at https://github.com/NVIDIA/NeMo#Installation\n", - "GITHUB_ACCOUNT = \"NVIDIA\"\n", - "BRANCH = 'main'\n", - "!python -m pip install git+https://github.com/{GITHUB_ACCOUNT}/NeMo.git@{BRANCH}#egg=nemo_toolkit[all]\n", - "\n", - "# Download local version of NeMo scripts. If you are running locally and want to use your own local NeMo code,\n", - "# comment out the below lines and set NEMO_DIR to your local path.\n", - "NEMO_DIR = 'nemo'\n", - "!git clone -b {BRANCH} https://github.com/{GITHUB_ACCOUNT}/NeMo.git $NEMO_DIR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_M92gCn_NW1_" - }, - "source": [ - "## Additional installs\n", - "We will use `sentence_splitter` to split abstracts to sentences." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ddyJA3NtGl9C" - }, - "outputs": [], - "source": [ - "!pip install sentence_splitter" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "qVa91rGkeFje" - }, - "source": [ - "Clone the SpellMapper model from HuggingFace.\n", - "Note that we will need not only the checkpoint itself, but also the ngram mapping vocabulary `replacement_vocab_filt.txt` from the same folder." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JiI9dkEm5cpW" - }, - "outputs": [], - "source": [ - "!git clone https://huggingface.co/bene-ges/spellmapper_asr_customization_en" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8saqFOePVfFf" - }, - "source": [ - "## Imports\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tAJyiYn_VnrF" - }, - "outputs": [], - "source": [ - "import IPython.display as ipd\n", - "import json\n", - "import random\n", - "import re\n", - "import soundfile as sf\n", - "import torch\n", - "\n", - "from collections import Counter, defaultdict\n", - "from difflib import SequenceMatcher\n", - "from matplotlib.pyplot import imshow\n", - "from matplotlib import pyplot as plt\n", - "from sentence_splitter import SentenceSplitter\n", - "from typing import List, Set, Tuple\n", - "\n", - "from nemo.collections.tts.models import FastPitchModel\n", - "from nemo.collections.tts.models import HifiGanModel\n", - "\n", - "from nemo.collections.asr.parts.utils.manifest_utils import read_manifest\n", - "\n", - "from nemo.collections.nlp.data.spellchecking_asr_customization.utils import (\n", - " get_all_candidates_coverage,\n", - " get_index,\n", - " load_ngram_mappings,\n", - " search_in_index,\n", - " get_candidates,\n", - " read_spellmapper_predictions,\n", - " apply_replacements_to_text,\n", - " load_ngram_mappings_for_dp,\n", - " get_alignment_by_dp,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mfAaOdAWUGUV" - }, - "source": [ - "Use seed to get a reproducible behaviour." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UlGnNKTuT_6A" - }, - "outputs": [], - "source": [ - "random.seed(0)\n", - "torch.manual_seed(0)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPPHI7Zd_fDz" - }, - "source": [ - "## Download data\n", - "\n", - "File `pubmed24n0009.xml` taken from public ftp server of https://www.ncbi.nlm.nih.gov/pmc/ contains information about 5593 medical papers, from which we extract only their abstracts. We will feed sentences from there to TTS + ASR to get initial ASR results.\n", - "\n", - "File `wordlist.txt` contains 100k **single-word** medical terms.\n", - "\n", - "File `valid_adam.txt` contains 24k medical abbreviations with their full forms. We will use those full forms as examples of **multi-word** medical terms.\n", - "\n", - "File `count_1w.txt` contains 330k single words with their frequencies from Google Ngrams corpus. We will use this file to filter out frequent words from our custom vocabulary.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mX6cvE8xw2n1" - }, - "outputs": [], - "source": [ - "!wget https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/pubmed24n0009.xml.gz\n", - "!gunzip pubmed24n0009.xml.gz\n", - "!grep \"AbstractText\" pubmed24n0009.xml > abstract.txt\n", - "\n", - "!wget https://raw.githubusercontent.com/McGill-NLP/medal/master/toy_data/valid_adam.txt\n", - "!wget https://raw.githubusercontent.com/glutanimate/wordlist-medicalterms-en/master/wordlist.txt\n", - "!wget https://norvig.com/ngrams/count_1w.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mBm9BeqNaRlC" - }, - "source": [ - "## Auxiliary functions\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kVUKhSh48Ypi" - }, - "outputs": [], - "source": [ - "CHARS_TO_IGNORE_REGEX = re.compile(r\"[\\.\\,\\?\\:!;()«»…\\]\\[/\\*–‽+&_\\\\½√>€™$•¼}{~—=“\\\"”″‟„]\")\n", - "\n", - "\n", - "def get_medical_vocabulary() -> Tuple[Set[str], Set[str]]:\n", - " \"\"\"This function builds a vocabulary of medical terms using downloaded sources:\n", - " wordlist.txt - 100k single-word medical terms.\n", - " valid_adam.txt - 24k medical abbreviations with their full forms. We use those full forms as examples of multi-word medical terms.\n", - " count_1w.txt - 330k single words with their frequencies from Google Ngrams corpus. We will use this file to filter out frequent words from our custom vocabulary.\n", - " \"\"\"\n", - " common_words = set()\n", - " with open(\"count_1w.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word, freq = line.strip().casefold().split(\"\\t\")\n", - " if int(freq) < 500000:\n", - " break\n", - " common_words.add(word)\n", - " print(\"Size of common words vocabulary:\", len(common_words))\n", - "\n", - " abbreviations = defaultdict(set)\n", - " medical_vocabulary = set()\n", - " with open(\"valid_adam.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - " # first line is header\n", - " for line in lines[1:]:\n", - " abbrev, _, phrase = line.strip().split(\"\\t\")\n", - " # skip phrases longer than 3 words because some of them are long explanations\n", - " if phrase.count(\" \") > 2:\n", - " continue\n", - " if phrase in common_words:\n", - " continue\n", - " medical_vocabulary.add(phrase)\n", - " abbrev = abbrev.lower()\n", - " abbreviations[abbrev].add(phrase)\n", - "\n", - " with open(\"wordlist.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " word = line.strip().casefold()\n", - " # skip words containing digits\n", - " if re.match(r\".*\\d.*\", word):\n", - " continue\n", - " if re.match(r\".*[\\[\\]\\(\\)\\+\\,\\.].*\", word):\n", - " continue\n", - " if word in common_words:\n", - " continue\n", - " medical_vocabulary.add(word)\n", - "\n", - " print(\"Size of medical vocabulary:\", len(medical_vocabulary))\n", - " print(\"Size of abbreviation vocabulary:\", len(abbreviations))\n", - " return medical_vocabulary, abbreviations\n", - "\n", - "\n", - "def read_abstracts(medical_vocabulary: Set[str]) -> Tuple[List[str], Set[str], Set[str]]:\n", - " \"\"\"This function reads the downloaded medical abstracts, and extracts sentences containing any word/phrase from the medical vocabulary.\n", - " Args:\n", - " medical_vocabulary: set of known medical words or phrases\n", - " Returns:\n", - " sentences: list of extracted sentences\n", - " all_found_singleword: set of single words from medical vocabulary that occurred at least in one sentence\n", - " all_found_multiword: set of multi-word phrases from medical vocabulary that occurred at least in one sentence\n", - " \"\"\"\n", - " splitter = SentenceSplitter(language='en')\n", - "\n", - " all_sentences = []\n", - " all_found_singleword = set()\n", - " all_found_multiword = set()\n", - " with open(\"abstract.txt\", \"r\", encoding=\"utf-8\") as f:\n", - " for line in f:\n", - " text = line.strip().replace(\"\", \"\").replace(\"\", \"\")\n", - " sents = splitter.split(text)\n", - " found_singleword = set()\n", - " found_multiword = set()\n", - " for sent in sents:\n", - " # remove anything in brackets from text\n", - " sent = re.sub(r\"\\(.+\\)\", r\"\", sent)\n", - " # remove quotes from text\n", - " sent = sent.replace(\"\\\"\", \"\")\n", - " # skip sentences containing digits because normalization is out of scope of this tutorial\n", - " if re.match(r\".*\\d.*\", sent):\n", - " continue\n", - " # skip sentences containing abbreviations with period inside the sentence (for the same reason)\n", - " if \". \" in sent:\n", - " continue\n", - " # skip long sentences as they may cause OOM issues\n", - " if len(sent) > 150:\n", - " continue\n", - " # replace all punctuation to space and convert to lowercase\n", - " sent_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", sent).lower()\n", - " sent_clean = \" \".join(sent_clean.split(\" \"))\n", - " words = sent_clean.split(\" \")\n", - "\n", - " found_phrases = set()\n", - " for begin in range(len(words)):\n", - " for end in range(begin + 1, min(begin + 4, len(words))):\n", - " phrase = \" \".join(words[begin:end])\n", - " if phrase in medical_vocabulary:\n", - " found_phrases.add(phrase)\n", - " if end - begin == 1:\n", - " found_singleword.add(phrase)\n", - " else:\n", - " found_multiword.add(phrase)\n", - " if len(found_phrases) > 0:\n", - " all_sentences.append((sent, \";\".join(found_phrases)))\n", - " all_found_singleword = all_found_singleword.union(found_singleword)\n", - " all_found_multiword = all_found_multiword.union(found_multiword)\n", - "\n", - " print(\"Sentences:\", len(all_sentences))\n", - " print(\"Unique single-word terms found:\", len(all_found_singleword))\n", - " print(\"Unique multi-word terms found:\", len(all_found_multiword))\n", - " print(\"Examples of multi-word terms\", str(list(all_found_multiword)[0:10]))\n", - " \n", - " return all_sentences, all_found_singleword, all_found_multiword" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XU3xeCBVpWOL" - }, - "outputs": [], - "source": [ - "def get_fragments(i_words: List[str], j_words: List[str]) -> List[Tuple[str, str, str, int, int, int, int]]:\n", - " \"\"\"This function is used to compare two word sequences to find minimal fragments that differ.\n", - " Args:\n", - " i_words: list of words in first sequence\n", - " j_words: list of words in second sequence\n", - " Returns:\n", - " list of tuples (difference_type, fragment1, fragment2, begin_of_fragment1, end_of_fragment1, begin_of_fragment2, end_of_fragment2)\n", - " \"\"\"\n", - " s = SequenceMatcher(None, i_words, j_words)\n", - " result = []\n", - " for tag, i1, i2, j1, j2 in s.get_opcodes():\n", - " result.append((tag, \" \".join(i_words[i1:i2]), \" \".join(j_words[j1:j2]), i1, i2, j1, j2))\n", - " result = sorted(result, key=lambda x: x[3])\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2ydXp_pFYmYu" - }, - "source": [ - "## Read medical data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WAeauax0SV1-" - }, - "outputs": [], - "source": [ - "medical_vocabulary, _ = get_medical_vocabulary()\n", - "sentences, found_singleword, found_multiword = read_abstracts(medical_vocabulary)\n", - "# in case if we need random candidates from a big sample - we will use full medical vocabulary for that purpose.\n", - "big_sample = list(medical_vocabulary)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "FRli7-Kx7sOO" - }, - "outputs": [], - "source": [ - "for sent, phrases in sentences[0:10]:\n", - " print(sent, \"\\t\", phrases)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rL1VqH2_dk93" - }, - "source": [ - "# SpellMapper ASR Customization\n", - "\n", - "SpellMapper model relies on two offline preparation steps:\n", - "1. Collecting n-gram mappings from a large corpus (this mappings vocabulary had been collected once on a large corpus and is supplied with the model).\n", - "2. Indexing of user vocabulary by n-grams.\n", - "\n", - "![Offline data preparation](images/spellmapper_data_preparation.png)\n", - "\n", - "At inference time we take as input an ASR hypothesis and an n-gram-indexed user vocabulary and perform following steps:\n", - "1. Retrieve the top 10 candidate phrases from the user vocabulary that are likely to be contained in the given ASR-hypothesis, possibly in a misspelled form.\n", - "2. Run the neural model that tags the input characters with correct candidate labels or 0 if no match is found.\n", - "3. Do post-processing to combine results.\n", - "\n", - "![Inference pipeline](images/spellmapper_inference_pipeline.png)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "OeJpsMwslmrd" - }, - "source": [ - "## N-gram mappings\n", - "Note that n-gram mappings vocabulary had been collected from a large corpus and is supplied with the model. It is supposed to be \"universal\" for English language.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uH6p0mOd12pi" - }, - "source": [ - "Let's see what n-gram mappings are like, for example, for an n-gram `l u c`.\n", - "Note that n-grams in `replacement_vocab_filt.txt` preserve one-to-one correspondence between original letters and misspelled fragments (this additional markup is handled during loading). \n", - "* `+` means that adjacent letters are concatenated and correspond to a single source letter. \n", - "* `` means that the original letter is deleted. \n", - "This auxiliary markup will be removed automatically during loading.\n", - "\n", - "`_` is used instead of real space symbol.\n", - "\n", - "Last three columns are:\n", - "* joint frequency\n", - "* frequency of original n-gram\n", - "* frequency of misspelled n-gram\n", - "\n", - "$$\\frac{JointFrequency}{SourceFrequency}=TranslationProbability$$\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qul163dB1sKp" - }, - "outputs": [], - "source": [ - "!awk 'BEGIN {FS=\"\\t\"} ($1==\"l u c\"){print $0}' < spellmapper_asr_customization_en/replacement_vocab_filt.txt | sort -t$'\\t' -k3nr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eWxcrVWZ3Pfq" - }, - "source": [ - "Now we read n-gram mappings from the file. Parameter `max_misspelled_freq` controls maximum frequency of misspelled n-grams. N-grams more frequent than that are put in the list of banned n-grams and won't be used in indexing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WHKhE945-N7o" - }, - "outputs": [], - "source": [ - "print(\"load n-gram mappings...\")\n", - "ngram_mapping_vocab, ban_ngram = load_ngram_mappings(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\", max_misspelled_freq=125000)\n", - "# CAUTION: entries in ban_ngram end with a space and can contain \"+\" \"=\"\n", - "print(\"Size of ngram mapping vocabulary:\", len(ngram_mapping_vocab))\n", - "print(\"Size of banned ngrams:\", len(ban_ngram))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "49IcMBfllvXN" - }, - "source": [ - "## Indexing of custom vocabulary" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "b1K6paeee2Iu" - }, - "source": [ - "As we mentioned earlier, this model pipeline is intended to work with custom vocabularies up to several thousand entries. Since the whole medical vocabulary contains 110k entries, we restrict our custom vocabulary to 5000+ terms that occurred in given corpus of abstracts.\n", - "\n", - "The goal of indexing our custom vocabulary is to build an index where key is a letter n-gram and value is the whole phrase. The keys are n-grams in the given user phrase and their misspelled variants taken from our collection of n-\n", - "gram mappings (see Index of custom vocabulary in Fig. 1)\n", - "\n", - "*Though it is possible to index and search the whole 110k vocabulary, it will require additional optimizations and is beyond the scope of this tutorial.*" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xWb0jGqw6Woi" - }, - "outputs": [], - "source": [ - "custom_phrases = []\n", - "for phrase in medical_vocabulary:\n", - " if phrase not in found_singleword and phrase not in found_multiword:\n", - " continue\n", - " custom_phrases.append(\" \".join(list(phrase.replace(\" \", \"_\"))))\n", - "print(\"Size of customization vocabulary:\", len(custom_phrases))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UHWor5pD2Eyb" - }, - "source": [ - "Now we build the index for our custom phrases.\n", - "\n", - "Parameter `min_log_prob` controls minimum log probability, after which we stop growing this n-gram.\n", - "\n", - "Parameter `max_phrases_per_ngram` controls maximum number of phrases that can be indexed by one ngram. N-grams exceeding this limit are also banned and not used in indexing.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hs4RDXj0-xW9" - }, - "outputs": [], - "source": [ - "phrases, ngram2phrases = get_index(custom_phrases, ngram_mapping_vocab, ban_ngram, min_log_prob=-4.0, max_phrases_per_ngram=600)\n", - "print(\"Size of phrases:\", len(phrases))\n", - "print(\"Size of ngram2phrases:\", len(ngram2phrases))\n", - "\n", - "# Save index to file - later we will use it in other script\n", - "with open(\"index.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for ngram in ngram2phrases:\n", - " for phrase_id, begin, size, logprob in ngram2phrases[ngram]:\n", - " phrase = phrases[phrase_id]\n", - " out.write(ngram + \"\\t\" + phrase + \"\\t\" + str(begin) + \"\\t\" + str(size) + \"\\t\" + str(logprob) + \"\\n\")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RV1sdQ9rvar8" - }, - "source": [ - "## Small detailed example\n", - "\n", - "Let's consider, for example, one custom phrase `thoracic aorta` and an incorrect ASR-hypothesis `the tarasic oorda is a part of the aorta located in the thorax`, containing a misspelled phrase `tarasic_oorda`. \n", - "\n", - "We will see \n", - "1. How this custom phrase is indexed.\n", - "2. How candidate retrieval works, given ASR-hypothesis.\n", - "3. How inference and post-processing work.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kGBTTJXixnrG" - }, - "source": [ - "### N-grams in index" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ryfUlqNMl4vQ" - }, - "source": [ - "Let's look, for example, by what n-grams a custom phrase `thoracic aorta` is indexed. \n", - "Columns: \n", - "1. n-gram\n", - "2. beginning position in the phrase\n", - "3. length\n", - "4. log probability\n", - "\n", - "Note that many n-grams are not from n-gram mappings file. Those are derived by growing previous n-grams with new replacements. In this case log probabilities are summed up. Growing stops, when minimum log prob is exceeded.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "x0ZVsXGBo8pt" - }, - "outputs": [], - "source": [ - "for ngram in ngram2phrases:\n", - " for phrase_id, b, length, lprob in ngram2phrases[ngram]:\n", - " if phrases[phrase_id] == \"t h o r a c i c _ a o r t a\":\n", - " print(ngram.ljust(16) + \"\\t\" + str(b).rjust(4) + \"\\t\" + str(length).rjust(4) + \"\\t\" + str(lprob))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "20ov23ze4xeQ" - }, - "source": [ - "### Candidate retrieval\n", - "Candidate retrieval tasks are:\n", - " - Given an input sentence and an index of custom vocabulary find all n-grams from the index matching the sentence. \n", - " - Find which sentence fragments and which custom phrases have most \"hits\" - potential candidates.\n", - " - Find approximate starting position for each candidate phrase. \n", - "\n", - "\n", - "Let's look at the hits, that phrase \"thoracic aorta\" gets by searching all ngrams in the input text. We can see some hits in different part of the sentence, but a moving window can find a fragment with most hits." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "t_rhKQ3Xqa8A" - }, - "outputs": [], - "source": [ - "sent = \"the_tarasic_oorda_is_a_part_of_the_aorta_located_in_the_thorax\"\n", - "phrases2positions, position2ngrams = search_in_index(ngram2phrases, phrases, sent)\n", - "print(\" \".join(list(sent)))\n", - "print(\" \".join(list(map(str, phrases2positions[phrases.index(\"t h o r a c i c _ a o r t a\")].astype(int)))))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "orkRapbjF4aZ" - }, - "source": [ - "`phrases2positions` is a matrix of size (len(phrases), len(ASR_hypothesis)).\n", - "It is filled with 1.0 (hits) on intersection of letter n-grams and phrases that are indexed by these n-grams, 0.0 - elsewhere.\n", - "It is used to find phrases with many hits within a contiguous window - potential matching candidates.\n", - "\n", - "`position2ngrams` is a list of sets of ngrams. List index is the starting position in the ASR-hypothesis.\n", - "It is used later to check how well each found candidate is covered by n-grams (to avoid cases where some repeating n-gram gives many hits to a phrase, but the phrase itself is not well covered)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JF7u4_iiHLyI" - }, - "outputs": [], - "source": [ - "candidate2coverage, candidate2position = get_all_candidates_coverage(phrases, phrases2positions)\n", - "print(\"Coverage=\", candidate2coverage[phrases.index(\"t h o r a c i c _ a o r t a\")])\n", - "print(\"Starting position=\", candidate2position[phrases.index(\"t h o r a c i c _ a o r t a\")])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "45mvKg8ZyNbr" - }, - "source": [ - "`candidate2coverage` is a list of size len(phrases) containing coverage (0.0 to 1.0) in best window.\n", - "Coverage is a smoothed percentage of hits in the window of size of the given phrase.\n", - "\n", - "`candidate2position` is a list of size len(phrases) containing starting position of best window.\n", - "\n", - "Starting position is approximate, it's ok. If it is not at the beginning of some word, SpellMapper will try to adjust it later. In this particular example we get 5 as starting position instead of 4, missing the first letter." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Sjyn9I98udL9" - }, - "source": [ - "### Inference\n", - "\n", - "Now let's generate input for SpellMapper inference. \n", - "An input line should consist of 4 tab-separated columns:\n", - " - text of ASR-hypothesis\n", - " - texts of 10 candidates separated by semicolon\n", - " - 1-based ids of non-dummy candidates\n", - " - approximate start/end coordinates of non-dummy candidates (correspond to ids)\n", - "Note that candidate retrieval is done inside the function `get_candidates`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "cJnusVfBRhRX" - }, - "outputs": [], - "source": [ - "out = open(\"spellmapper_input.txt\", \"w\", encoding=\"utf-8\")\n", - "letters = list(sent)\n", - "candidates = get_candidates(ngram2phrases, phrases, letters, big_sample)\n", - "# We add two columns with targets and span_info. \n", - "# They have same format as during training, but start and end positions are APPROXIMATE, they will be adjusted when constructing BertExample.\n", - "targets = []\n", - "span_info = []\n", - "for idx, c in enumerate(candidates):\n", - " if c[1] == -1:\n", - " continue\n", - " targets.append(str(idx + 1)) # targets are 1-based\n", - " start = c[1]\n", - " end = min(c[1] + c[2], len(letters)) # ensure that end is not outside sentence length (it can happen because c[2] is candidate length used as approximation)\n", - " span_info.append(\"CUSTOM \" + str(start) + \" \" + str(end))\n", - "\n", - "out.write(\" \".join(letters) + \"\\t\" + \";\".join([x[0] for x in candidates]) + \"\\t\" + \" \".join(targets) + \"\\t\" + \";\".join(span_info) + \"\\n\")\n", - "out.close()\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Qpei5o89SmaU" - }, - "outputs": [], - "source": [ - "!cat spellmapper_input.txt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9rAmO15SS6go" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "wd2aq4T1N5cs" - }, - "source": [ - "Each line in SpellMapper output is tab-separated and consists of 4 columns:\n", - "1. ASR-hypothesis (same as in input)\n", - "2. 10 candidates separated with semicolon (same as in input)\n", - "3. fragment predictions, separated with semicolon, each prediction is a tuple (start, end, candidate_id, probability)\n", - "4. letter predictions - candidate_id predicted for each letter (this is only for debug purposes)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ravgEX8cTFty" - }, - "outputs": [], - "source": [ - "!cat spellmapper_output.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "az26364-PHb2" - }, - "source": [ - "We can use some utility functions to apply found replacements and get actual corrected text." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lPtFa_EhK8pb" - }, - "outputs": [], - "source": [ - "spellmapper_results = read_spellmapper_predictions(\"spellmapper_output.txt\")\n", - "text, replacements, _ = spellmapper_results[0]\n", - "corrected_text = apply_replacements_to_text(text, replacements, replace_hyphen_to_space=False)\n", - "print(\"Text before correction:\\n\", text)\n", - "print(\"Text after correction:\\n\", corrected_text)\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "efF7O-D91FLX" - }, - "source": [ - "# Bigger customization example\n", - "\n", - "Let's test customization on more data. The plan is\n", - " * Get baseline ASR transcriptions by running TTS + ASR on some medical paper abstracts.\n", - " * Run SpellMapper inference and show how it can improve ASR results using custom vocabulary.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r_EFPnyDcXZt" - }, - "source": [ - "## Run TTS" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "i9F5SBhmr8rk" - }, - "outputs": [], - "source": [ - "# create a folder for wav files (TTS output)\n", - "!rm -r audio\n", - "!mkdir audio" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JMbkNVt7YBAO" - }, - "outputs": [], - "source": [ - "if torch.cuda.is_available():\n", - " device = \"cuda\"\n", - "else:\n", - " device = \"cpu\"\n", - "\n", - "# Load FastPitch from HuggingFace\n", - "spectrogram_generator = FastPitchModel.from_pretrained(\"nvidia/tts_en_fastpitch\").eval().to(device)\n", - "# Load HifiGan vocoder from HuggingFace\n", - "vocoder = HifiGanModel.from_pretrained(model_name=\"nvidia/tts_hifigan\").eval().to(device)\n", - "\n", - "# Write sentences that we want to feed to TTS\n", - "with open(\"tts_input.txt\", \"w\", encoding=\"utf-8\") as out:\n", - " for sent, _ in sentences[0:100]:\n", - " out.write(sent + \"\\n\")\n", - "\n", - "out_manifest = open(\"manifest.json\", \"w\", encoding=\"utf-8\")\n", - "i = 0\n", - "with open(\"tts_input.txt\", \"r\", encoding=\"utf-8\") as inp:\n", - " for line in inp:\n", - " text = line.strip()\n", - " text_clean = CHARS_TO_IGNORE_REGEX.sub(\" \", text).lower() #replace all punctuation to space and convert to lowercase\n", - " text_clean = \" \".join(text_clean.split())\n", - "\n", - " parsed = spectrogram_generator.parse(text, normalize=True)\n", - "\n", - " spectrogram = spectrogram_generator.generate_spectrogram(tokens=parsed)\n", - " audio = vocoder.convert_spectrogram_to_audio(spec=spectrogram)\n", - "\n", - " # Note that vocoder return a batch of audio. In this example, we just take the first and only sample.\n", - " filename = \"audio/\" + str(i) + \".wav\"\n", - " sf.write(filename, audio.to('cpu').detach().numpy()[0], 16000)\n", - " out_manifest.write(\n", - " \"{\\\"audio_filepath\\\": \\\"\" + filename + \"\\\", \\\"text\\\": \\\"\" + text_clean + \"\\\", \\\"orig_text\\\": \\\"\" + text + \"\\\"}\\n\"\n", - " )\n", - " i += 1\n", - "\n", - " # display some examples\n", - " if i < 10:\n", - " print(f'\"{text}\"\\n')\n", - " ipd.display(ipd.Audio(audio.to('cpu').detach(), rate=22050))\n", - "\n", - "out_manifest.close()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9T3CZcCAmxCz" - }, - "source": [ - "Now we have a folder with generated audios `audio/*.wav` and a nemo manifest with json records like `{\"audio_filepath\": \"audio/0.wav\", \"text\": \"no renal auditory or vestibular toxicity was observed\", \"orig_text\": \"No renal, auditory, or vestibular toxicity was observed.\"}`.", - "\n", - "Note that TTS model may mispronounce some unknown words, for example, abbreviations like `tRNAs`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pR_T1HnttVjm" - }, - "outputs": [], - "source": [ - "lines = []\n", - "with open(\"manifest.json\", \"r\", encoding=\"utf-8\") as f:\n", - " lines = f.readlines()\n", - "\n", - "for line in lines:\n", - " try:\n", - " data = json.loads(line.strip())\n", - " except:\n", - " print(line)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bt2TMLLvdUHm" - }, - "source": [ - "Free GPU memory to avoid OOM." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZwEpAOCaRH7s" - }, - "outputs": [], - "source": [ - "del spectrogram_generator\n", - "del vocoder\n", - "torch.cuda.empty_cache()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HrensakWdLkt" - }, - "source": [ - "## Run baseline ASR" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IQNIo2M_mqJc" - }, - "source": [ - "Next we transcribe our .wav files with a general domain [ASR model](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/nemo/models/stt_en_conformer_ctc_large). It will generate an output file `ctc_baseline_transcript.json` where the predicted transcriptions are stored in the field `pred_text` of each record.\n", - "\n", - "Note that this ASR model was not trained or fine-tuned on medical domain, so we expect it to make mistakes on medical terms." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NMN63ux1mJiG" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/transcribe_speech.py \\\n", - " pretrained_name=\"stt_en_conformer_ctc_large\" \\\n", - " dataset_manifest=manifest.json \\\n", - " output_filename=ctc_baseline_transcript_tmp.json \\\n", - " batch_size=2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L3swQ8uqqgnp" - }, - "source": [ - "ATTENTION: SpellMapper relies on words to be separated by _single_ space\n", - "\n", - "There is a bug with multiple space, observed in ASR results produced by Conformer-CTC, probably connected to this issue: https://github.com/NVIDIA/NeMo/issues/4034.\n", - "\n", - "So we need to correct the manifests to ensure that all spaces are single." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z17sxkmXrXpJ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript_tmp.json\")\n", - "\n", - "for i in range(len(test_data)):\n", - " # if there are multiple spaces in the string they will be merged to one\n", - " test_data[i][\"pred_text\"] = \" \".join(test_data[i][\"pred_text\"].split())\n", - "\n", - "with open(\"ctc_baseline_transcript.json\", \"w\", encoding=\"utf-8\") as out:\n", - " for d in test_data:\n", - " line = json.dumps(d)\n", - " out.write(line + \"\\n\")\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PuKtfhbVkVJY" - }, - "outputs": [], - "source": [ - "!head -n 4 ctc_baseline_transcript.json" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "aCJw9NEXqRg8" - }, - "source": [ - "### Calculating WER of baseline transcript\n", - "We use the standard script from NeMo to calculate WER and CER of our baseline transcript. Internally it compares the text in `pred_text` (predicted transcript) to `text` (reference transcript). " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZmNEGVWQsGo2" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_baseline_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "AvPwJr0ZqdkN" - }, - "source": [ - "### See fragments that differ\n", - "We use SequenceMatcher to see fragments that differ. (Another option is to use a more powerful analytics tool [Speech Data Explorer](https://docs.nvidia.com/deeplearning/nemo/user-guide/docs/en/main/tools/speech_data_explorer.html))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RAeaVCpMv78y" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_baseline_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['text'] for data in test_data]\n", - "audio_filepath = [data['audio_filepath'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"PRED vs REF\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dUSOF7iD1w_9" - }, - "source": [ - "## Run SpellMapper" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "x39BQhYB6_Fr" - }, - "source": [ - "Now we run retrieval on our input manifest and prepare input for SpellMapper inference. Note that we use index of custom vocabulary (file `index.txt` that we saved earlier)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y8x-yT5WqfFz" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/prepare_input_from_manifest.py \\\n", - " --manifest ctc_baseline_transcript.json \\\n", - " --custom_vocab_index index.txt \\\n", - " --big_sample spellmapper_asr_customization_en/big_sample.txt \\\n", - " --short2full_name short2full.txt \\\n", - " --output_name spellmapper_input.txt" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "ueq_JAPWGs_Y" - }, - "source": [ - "Run the inference." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zgkqiiZtJjcB" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/spellchecking_asr_customization_infer.py \\\n", - " pretrained_model=spellmapper_asr_customization_en/training_10m_5ep.nemo \\\n", - " model.max_sequence_len=512 \\\n", - " inference.from_file=spellmapper_input.txt \\\n", - " inference.out_file=spellmapper_output.txt \\\n", - " inference.batch_size=16 \\\n", - " lang=en\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RPQWJX8dFLfX" - }, - "source": [ - "Now we postprocess SpellMapper output and create output corrected manifest." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3eFU515yKvXP" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --ngram_mappings \"\"\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hRoIhhGh17tp" - }, - "source": [ - "### Calculating WER of corrected transcript." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qIT957bGo9AY" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript.json \\\n", - " only_score_manifest=True\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NYXIPusupqOQ" - }, - "outputs": [], - "source": [ - "test_data = read_manifest(\"ctc_corrected_transcript.json\")\n", - "pred_text = [data['pred_text'] for data in test_data]\n", - "ref_text = [data['pred_text_before_correction'] for data in test_data]\n", - "\n", - "diff_vocab = Counter()\n", - "\n", - "for i in range(len(test_data)):\n", - " ref_sent = \" \" + ref_text[i] + \" \"\n", - " pred_sent = \" \" + pred_text[i] + \" \"\n", - "\n", - " pred_words = pred_sent.strip().split()\n", - " ref_words = ref_sent.strip().split()\n", - "\n", - " for tag, hyp_fragment, ref_fragment, i1, i2, j1, j2 in get_fragments(pred_words, ref_words):\n", - " if tag != \"equal\":\n", - " diff_vocab[(tag, hyp_fragment, ref_fragment)] += 1\n", - "\n", - "sum_ = 0\n", - "print(\"Corrected vs baseline\")\n", - "for k, v in diff_vocab.most_common(1000000):\n", - " sum_ += v\n", - " print(k, v, \"sum=\", sum_)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DJtXlqXbTD6M" - }, - "source": [ - "### Filtering by Dynamic Programming(DP) score\n", - "\n", - "What else can be done?\n", - "Given a fragment and its potential replacement, we can apply **dynamic programming** to find the most probable \"translation\" path between them. We will use the same n-gram mapping vocabulary, because its frequencies give us \"translation probability\" of each n-gram pair. The final path score can be calculated as maximum sum of log probabilities of matching n-grams along this path.\n", - "Let's look at an example. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "05Qf9wgHU_UR" - }, - "outputs": [], - "source": [ - "joint_vocab, orig_vocab, misspelled_vocab, max_len = load_ngram_mappings_for_dp(\"spellmapper_asr_customization_en/replacement_vocab_filt.txt\")\n", - "\n", - "fragment = \"and hydrod\"\n", - "replacement = \"anhydride\"\n", - "fragment_spaced = \" \".join(list(fragment.replace(\" \", \"_\")))\n", - "replacement_spaced = \" \".join(list(replacement.replace(\" \", \"_\")))\n", - "path = get_alignment_by_dp(\n", - " replacement_spaced,\n", - " fragment_spaced,\n", - " dp_data=(joint_vocab, orig_vocab, misspelled_vocab, max_len)\n", - ")\n", - "print(\"Dynamic Programming path:\")\n", - "for fragment_ngram, replacement_ngram, score, sum_score, joint_freq, orig_freq, misspelled_freq in path:\n", - " print(\n", - " \"\\t\",\n", - " \"frag=\",\n", - " fragment_ngram,\n", - " \"; repl=\",\n", - " replacement_ngram,\n", - " \"; score=\",\n", - " score,\n", - " \"; sum_score=\",\n", - " sum_score,\n", - " \"; joint_freq=\",\n", - " joint_freq,\n", - " \"; orig_freq=\",\n", - " orig_freq,\n", - " \"; misspelled_freq=\",\n", - " misspelled_freq,\n", - " )\n", - "\n", - "print(\"Final path score is in path[-1][3]: \", path[-1][3])\n", - "print(\"Dynamic programming(DP) score per symbol is final score divided by len(fragment): \", path[-1][3] / (len(fragment)))\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "hgfKPKckaLnc" - }, - "source": [ - "The idea is that we can skip replacements whose average DP score per symbol is below some predefined minimum, say -1.5.\n", - "Note that dynamic programming works slow because of quadratic complexity, but it allows to get rid of some false positives. Let's apply it on the same test set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UhSXh7ht_JRn" - }, - "outputs": [], - "source": [ - "!python nemo/examples/nlp/spellchecking_asr_customization/postprocess_and_update_manifest.py \\\n", - " --input_manifest ctc_baseline_transcript.json \\\n", - " --short2full_name short2full.txt \\\n", - " --output_manifest ctc_corrected_transcript_dp.json \\\n", - " --spellmapper_result spellmapper_output.txt \\\n", - " --replace_hyphen_to_space \\\n", - " --field_name pred_text \\\n", - " --use_dp \\\n", - " --ngram_mappings spellmapper_asr_customization_en/replacement_vocab_filt.txt \\\n", - " --min_dp_score_per_symbol -1.5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "u8R5YHB3vPC8" - }, - "outputs": [], - "source": [ - "!python nemo/examples/asr/speech_to_text_eval.py \\\n", - " dataset_manifest=ctc_corrected_transcript_dp.json \\\n", - " only_score_manifest=True" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "upvTbkFAeYtR" - }, - "source": [ - "# Final notes\n", - "1. Bash-script with example of inference pipeline [run_infer.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_infer.sh)\n", - "\n", - "2. Check our paper: [SpellMapper: A non-autoregressive neural spellchecker for ASR customization with candidate retrieval based on n-gram mappings](https://arxiv.org/abs/2306.02317)\n", - "\n", - "3. To reproduce evaluation experiments from this paper see these scripts:\n", - " - [test_on_kensho.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - " - [test_on_userlibri.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - " - [test_on_spoken_wikipedia.sh](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/evaluation/test_on_kensho.sh)\n", - "\n", - "4. To reproduce creation of training data see [README.md](https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/README.md)\n", - "\n", - "5. To run training see [run_training.sh](https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/spellchecking_asr_customization/run_training.sh)\n", - "\n", - "6. Promising future research directions would be:\n", - " - add a simple trainable classifier on top of SpellMapper predictions instead of using multiple thresholds\n", - " - retrain with adding more various false positives to the training data" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - } - }, - "nbformat": 4, - "nbformat_minor": 0 -}