Check dynamo graph-breaks in CI (#96346)

- Add graph-break baselines.
- Add the check_graph_breaks script (messages users on regression or improvement).
- Hook up test.sh for the existing accuracy job.

Refactor graph-break CI check: take steps toward merging the checker with the existing check flow; consider merging it all the way inside the bench runner.

csvs

Pull Request resolved: pytorch/pytorch#96346
Approved by: https://github.com/ezyang
cyyever · Mar 23, 2023 · 8fe782c · 8fe782c
1 parent 193bf71
commit 8fe782c
Show file tree

Hide file tree

Showing 10 changed files with 377 additions and 1 deletion.
diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh
@@ -320,6 +320,12 @@ test_single_dynamo_benchmark() {
       --output "$TEST_REPORTS_DIR/${name}_${suite}.csv"
     python benchmarks/dynamo/check_csv.py \
       -f "$TEST_REPORTS_DIR/${name}_${suite}.csv"
+    if [[ "${TEST_CONFIG}" != *cpu_accuracy* ]] && [[ "${TEST_CONFIG}" != *dynamic* ]]; then
+      # because I haven't tracked the cpu-side or dynamic expected artifacts yet, and need to differentiate filenames
+      python benchmarks/dynamo/check_graph_breaks.py \
+        --actual "$TEST_REPORTS_DIR/${name}_$suite.csv" \
+        --expected "benchmarks/dynamo/ci_expected_accuracy/${name}_${suite}${shard_id}.csv"
+    fi
   fi
 }
 
@@ -341,7 +347,6 @@ test_dynamo_benchmark() {
   else
     # Check inference with --float32
     test_single_dynamo_benchmark "inference" "$suite" "$shard_id" --float32 "$@"
-
     if [[ "${TEST_CONFIG}" != *cpu_accuracy* ]]; then
       # Check training with --amp
       test_single_dynamo_benchmark "training" "$suite" "$shard_id" --training --amp "$@"

diff --git a/benchmarks/dynamo/check_graph_breaks.py b/benchmarks/dynamo/check_graph_breaks.py
@@ -0,0 +1,83 @@
+import argparse
+import sys
+import textwrap
+
+import pandas as pd
+
+
+def get_field(csv, model_name: str, field: str, typ=float):
+    """Return one cell of a results table, converted with ``typ``.
+
+    Args:
+        csv: pandas DataFrame with a ``name`` column identifying each row.
+        model_name: value to look up in the ``name`` column.
+        field: column whose value is returned.
+        typ: callable applied to the scalar cell value (``float`` by default).
+
+    Returns:
+        ``typ(value)`` for the single row whose ``name`` equals ``model_name``.
+
+    Raises:
+        ValueError: if ``model_name`` matches no row or more than one row.
+    """
+    matches = csv.loc[csv["name"] == model_name, field]
+    if len(matches) != 1:
+        raise ValueError(
+            f"expected exactly one row named {model_name!r}, found {len(matches)}"
+        )
+    # Convert the scalar, not the one-element Series: calling int()/float() on
+    # a Series is deprecated in pandas and fails outright on an empty one.
+    return typ(matches.iloc[0])
+
+
+def check_graph_breaks(actual_csv, expected_csv, expected_filename):
+    """Compare per-model graph-break counts against a baseline table.
+
+    Prints one status block per model listed in ``actual_csv`` and builds a
+    summary message describing every model whose count differs from the
+    baseline or that has no baseline entry at all.
+
+    Args:
+        actual_csv: DataFrame with ``name`` and ``graph_breaks`` columns
+            holding the counts to check.
+        expected_csv: DataFrame with the same columns holding the baseline.
+        expected_filename: name of the baseline file, quoted in the message
+            so the user knows what to update.
+
+    Returns:
+        A ``(changed, msg)`` tuple. ``changed`` is the first non-empty list
+        among the regressed, improved, and baseline-less models (an empty
+        list, hence falsy, when everything matches). ``msg`` is the summary
+        text, or ``""`` when nothing changed.
+    """
+    failed = []
+    improved = []
+    missing = []
+    # Checked up front so that a model without a baseline is reported by name
+    # instead of crashing inside the lookup.
+    known_models = set(expected_csv["name"])
+
+    for model in actual_csv["name"]:
+        graph_breaks = get_field(actual_csv, model, "graph_breaks", typ=int)
+        if model in known_models:
+            expected_graph_breaks = get_field(
+                expected_csv, model, "graph_breaks", typ=int
+            )
+        else:
+            expected_graph_breaks = None
+
+        if expected_graph_breaks is None:
+            status = "MISSING"
+            missing.append(model)
+        elif graph_breaks == expected_graph_breaks:
+            status = "PASS"
+        elif graph_breaks > expected_graph_breaks:
+            status = "FAIL"
+            failed.append(model)
+        else:
+            status = "IMPROVED"
+            improved.append(model)
+        print(
+            f"""
+            {model:34}:
+                graph_breaks={graph_breaks},
+                expected_graph_breaks={expected_graph_breaks},
+                {status}
+            """
+        )
+
+    msg = ""
+    if failed or improved or missing:
+        if failed:
+            msg += textwrap.dedent(
+                f"""
+            Error: {len(failed)} models have new dynamo graph breaks:
+                {' '.join(failed)}
+
+            """
+            )
+        if improved:
+            msg += textwrap.dedent(
+                f"""
+            Improvement: {len(improved)} models have fixed dynamo graph breaks:
+                {' '.join(improved)}
+
+            """
+            )
+        if missing:
+            msg += textwrap.dedent(
+                f"""
+            Error: {len(missing)} models have no expected graph-break baseline:
+                {' '.join(missing)}
+
+            """
+            )
+        msg += textwrap.dedent(
+            f"""
+        If this change is expected, you can update `{expected_filename}` to reflect the new baseline.
+        This can either be done manually, or by downloading artifacts from your PR CI job.
+        (Search artifacts files for test-reports-test-inductor_torchbench, _timm, _huggingface)
+        """
+        )
+    # A truthy first element makes the caller exit non-zero; improvements and
+    # missing baselines count too, so the baseline file gets refreshed.
+    return failed or improved or missing, msg
+
+
+def main():
+    """Command-line entry point.
+
+    Reads the CSV files given by ``--actual`` and ``--expected``, compares
+    their graph-break counts, and when the comparison reports any change
+    prints the summary message and exits with status 1.
+    """
+    cli = argparse.ArgumentParser()
+    cli.add_argument("--actual", type=str, required=True)
+    cli.add_argument("--expected", type=str, required=True)
+    opts = cli.parse_args()
+
+    changed, report = check_graph_breaks(
+        pd.read_csv(opts.actual),
+        pd.read_csv(opts.expected),
+        opts.expected,
+    )
+    if not changed:
+        return
+
+    print(report)
+    sys.exit(1)
+
+# Script entry point: run the check only when executed directly, not on import.
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inference_huggingface.csv b/benchmarks/dynamo/ci_expected_accuracy/inference_huggingface.csv
@@ -0,0 +1,43 @@
+name,graph_breaks
+AlbertForMaskedLM,0
+AlbertForQuestionAnswering,0
+BartForCausalLM,7
+BertForMaskedLM,0
+BertForQuestionAnswering,0
+BlenderbotForCausalLM,0
+BlenderbotSmallForCausalLM,7
+BlenderbotSmallForConditionalGeneration,0
+CamemBert,0
+DebertaForMaskedLM,47
+DebertaForQuestionAnswering,47
+DebertaV2ForMaskedLM,0
+DistilBertForMaskedLM,0
+DistilBertForQuestionAnswering,0
+DistillGPT2,0
+ElectraForCausalLM,3
+ElectraForQuestionAnswering,0
+GPT2ForSequenceClassification,1
+GoogleFnet,41
+LayoutLMForMaskedLM,0
+LayoutLMForSequenceClassification,1
+M2M100ForConditionalGeneration,13
+MBartForCausalLM,7
+MBartForConditionalGeneration,0
+MT5ForConditionalGeneration,0
+MegatronBertForCausalLM,0
+MegatronBertForQuestionAnswering,0
+MobileBertForMaskedLM,0
+MobileBertForQuestionAnswering,0
+PLBartForCausalLM,7
+PLBartForConditionalGeneration,10
+PegasusForCausalLM,8
+PegasusForConditionalGeneration,11
+RobertaForCausalLM,0
+RobertaForQuestionAnswering,0
+Speech2Text2ForCausalLM,8
+T5ForConditionalGeneration,0
+T5Small,0
+TrOCRForCausalLM,7
+XGLMForCausalLM,8
+XLNetLMHeadModel,0
+YituTechConvBert,3
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inference_timm_models0.csv b/benchmarks/dynamo/ci_expected_accuracy/inference_timm_models0.csv
@@ -0,0 +1,29 @@
+name,graph_breaks
+adv_inception_v3,0
+beit_base_patch16_224,0
+coat_lite_mini,0
+convit_base,15
+convmixer_768_32,0
+convnext_base,0
+crossvit_9_240,0
+cspdarknet53,0
+deit_base_distilled_patch16_224,0
+dla102,0
+dm_nfnet_f0,0
+dpn107,0
+eca_botnext26ts_256,0
+eca_halonext26ts,0
+ese_vovnet19b_dw,0
+fbnetc_100,0
+fbnetv3_b,0
+gernet_l,0
+ghostnet_100,0
+gluon_inception_v3,0
+gmixer_24_224,0
+gmlp_s16_224,0
+hrnet_w18,0
+inception_v3,0
+jx_nest_base,0
+lcnet_050,0
+levit_128,0
+mixer_b16_224,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inference_timm_models1.csv b/benchmarks/dynamo/ci_expected_accuracy/inference_timm_models1.csv
@@ -0,0 +1,31 @@
+name,graph_breaks
+mixnet_l,0
+mnasnet_100,0
+mobilenetv2_100,0
+mobilenetv3_large_100,0
+mobilevit_s,0
+nfnet_l0,0
+pit_b_224,0
+pnasnet5large,0
+poolformer_m36,0
+regnety_002,0
+repvgg_a2,0
+res2net101_26w_4s,0
+res2net50_14w_8s,0
+res2next50,0
+resmlp_12_224,0
+resnest101e,0
+rexnet_100,0
+sebotnet33ts_256,0
+selecsls42b,0
+spnasnet_100,0
+swin_base_patch4_window7_224,0
+swsl_resnext101_32x16d,0
+tf_efficientnet_b0,0
+tf_mixnet_l,0
+tinynet_a,0
+tnt_s_patch16_224,0
+twins_pcpvt_base,0
+visformer_small,0
+vit_base_patch16_224,0
+volo_d1_224,0
diff --git a/benchmarks/dynamo/ci_expected_accuracy/inference_torchbench.csv b/benchmarks/dynamo/ci_expected_accuracy/inference_torchbench.csv
@@ -0,0 +1,49 @@
+name,graph_breaks
+BERT_pytorch,0
+Background_Matting,0
+LearningToPaint,0
+Super_SloMo,0
+alexnet,0
+attention_is_all_you_need_pytorch,0
+dcgan,0
+densenet121,0
+dlrm,0
+drq,0
+fastNLP_Bert,4
+functorch_dp_cifar10,0
+functorch_maml_omniglot,0
+hf_Albert,0
+hf_Bart,11
+hf_Bert,0
+hf_DistilBert,0
+hf_GPT2,0
+hf_Reformer,5
+hf_T5_large,0
+lennard_jones,0
+maml_omniglot,0
+mnasnet1_0,0
+mobilenet_v2,0
+mobilenet_v3_large,0
+nvidia_deeprecommender,0
+opacus_cifar10,0
+pyhpc_isoneutral_mixing,0
+pytorch_CycleGAN_and_pix2pix,0
+pytorch_stargan,0
+pytorch_unet,0
+resnet152,0
+resnet18,0
+resnet50,0
+resnext50_32x4d,0
+shufflenet_v2_x1_0,0
+soft_actor_critic,0
+speech_transformer,9
+squeezenet1_1,0
+timm_efficientnet,0
+timm_regnet,0
+timm_resnest,0
+timm_vision_transformer,0
+timm_vision_transformer_large,0
+timm_vovnet,0
+tts_angular,1
+vgg16,0
+yolov3,1
diff --git a/benchmarks/dynamo/ci_expected_accuracy/training_huggingface.csv b/benchmarks/dynamo/ci_expected_accuracy/training_huggingface.csv
@@ -0,0 +1,37 @@
+name,graph_breaks
+AlbertForMaskedLM,7
+AlbertForQuestionAnswering,7
+BartForCausalLM,15
+BertForMaskedLM,7
+BertForQuestionAnswering,7
+BlenderbotSmallForCausalLM,15
+BlenderbotSmallForConditionalGeneration,7
+CamemBert,7
+DebertaForMaskedLM,55
+DebertaForQuestionAnswering,55
+DebertaV2ForMaskedLM,0
+DistilBertForMaskedLM,7
+DistilBertForQuestionAnswering,7
+DistillGPT2,7
+ElectraForCausalLM,11
+ElectraForQuestionAnswering,7
+GPT2ForSequenceClassification,9
+LayoutLMForMaskedLM,7
+LayoutLMForSequenceClassification,9
+MBartForCausalLM,15
+MegatronBertForCausalLM,7
+MegatronBertForQuestionAnswering,7
+MobileBertForMaskedLM,4
+MobileBertForQuestionAnswering,4
+PLBartForCausalLM,15
+PLBartForConditionalGeneration,18
+PegasusForCausalLM,16
+PegasusForConditionalGeneration,16
+RobertaForCausalLM,7
+RobertaForQuestionAnswering,7
+Speech2Text2ForCausalLM,16
+T5ForConditionalGeneration,7
+T5Small,7
+TrOCRForCausalLM,15
+XLNetLMHeadModel,7
+YituTechConvBert,11
diff --git a/benchmarks/dynamo/ci_expected_accuracy/training_timm_models0.csv b/benchmarks/dynamo/ci_expected_accuracy/training_timm_models0.csv
@@ -0,0 +1,25 @@
+name,graph_breaks
+adv_inception_v3,7
+beit_base_patch16_224,7
+coat_lite_mini,7
+convmixer_768_32,4
+convnext_base,7
+crossvit_9_240,7
+cspdarknet53,9
+deit_base_distilled_patch16_224,7
+dla102,7
+dm_nfnet_f0,7
+dpn107,9
+eca_botnext26ts_256,9
+ese_vovnet19b_dw,9
+fbnetc_100,9
+gernet_l,9
+ghostnet_100,9
+gluon_inception_v3,7
+gmixer_24_224,7
+gmlp_s16_224,7
+hrnet_w18,4
+inception_v3,7
+jx_nest_base,7
+lcnet_050,9
+mixer_b16_224,7
diff --git a/benchmarks/dynamo/ci_expected_accuracy/training_timm_models1.csv b/benchmarks/dynamo/ci_expected_accuracy/training_timm_models1.csv
@@ -0,0 +1,30 @@
+name,graph_breaks
+mixnet_l,9
+mnasnet_100,9
+mobilenetv2_100,9
+mobilenetv3_large_100,9
+mobilevit_s,9
+nfnet_l0,7
+pit_b_224,7
+pnasnet5large,6
+poolformer_m36,7
+regnety_002,9
+repvgg_a2,9
+res2net101_26w_4s,7
+res2net50_14w_8s,7
+res2next50,7
+resmlp_12_224,7
+resnest101e,7
+rexnet_100,9
+selecsls42b,7
+spnasnet_100,9
+swin_base_patch4_window7_224,7
+swsl_resnext101_32x16d,7
+tf_efficientnet_b0,9
+tf_mixnet_l,9
+tinynet_a,9
+tnt_s_patch16_224,7
+twins_pcpvt_base,7
+visformer_small,7
+vit_base_patch16_224,7
+volo_d1_224,7