Showing 2,377 changed files with 31,744 additions and 152 deletions.
1.13.1

[2023-11-16 17:46:38,010] [INFO] [distributed.py:36:init_distributed] Not using the DeepSpeed or torch.distributed launchers, attempting to detect MPI environment...
[2023-11-16 17:46:38,911] [INFO] [distributed.py:83:mpi_discovery] Discovered MPI settings of world_rank=0, local_rank=0, world_size=1, master_addr=26.0.148.159, master_port=6000
[2023-11-16 17:46:38,911] [INFO] [distributed.py:46:init_distributed] Initializing torch distributed with backend: nccl
[2023-11-16 17:46:41,931] [INFO] [checkpointing.py:223:model_parallel_cuda_manual_seed] > initializing model parallel cuda seeds on global rank 0, model parallel rank 0, and data parallel rank 0 with model parallel seed: 3952 and data parallel seed: 1234
num_attention_heads: 128, hidden_size: 1024, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0081
Attention throughput (in TFLOP/s): 17.027
MLP duration (in seconds): 0.0008
MLP throughput (in TFLOP/s): 182.309
Transformer duration (in seconds): 0.0119
Transformer throughput (in TFLOP/s): 23.121
========================================================================================================================
num_attention_heads: 128, hidden_size: 2048, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0113
Attention throughput (in TFLOP/s): 36.332
MLP duration (in seconds): 0.0026
MLP throughput (in TFLOP/s): 213.031
Transformer duration (in seconds): 0.0137
Transformer throughput (in TFLOP/s): 70.045
========================================================================================================================
num_attention_heads: 128, hidden_size: 3072, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0116
Attention throughput (in TFLOP/s): 70.844
MLP duration (in seconds): 0.0058
MLP throughput (in TFLOP/s): 214.599
Transformer duration (in seconds): 0.0190
Transformer throughput (in TFLOP/s): 108.363
========================================================================================================================
num_attention_heads: 128, hidden_size: 4096, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0200
Attention throughput (in TFLOP/s): 68.625
MLP duration (in seconds): 0.0096
MLP throughput (in TFLOP/s): 228.121
Transformer duration (in seconds): 0.0317
Transformer throughput (in TFLOP/s): 112.557
========================================================================================================================
num_attention_heads: 128, hidden_size: 5120, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0277
Attention throughput (in TFLOP/s): 74.417
MLP duration (in seconds): 0.0141
MLP throughput (in TFLOP/s): 243.298
Transformer duration (in seconds): 0.0424
Transformer throughput (in TFLOP/s): 129.550
========================================================================================================================
num_attention_heads: 128, hidden_size: 6144, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0317
Attention throughput (in TFLOP/s): 91.151
MLP duration (in seconds): 0.0202
MLP throughput (in TFLOP/s): 244.987
Transformer duration (in seconds): 0.0474
Transformer throughput (in TFLOP/s): 165.404
========================================================================================================================
num_attention_heads: 128, hidden_size: 7168, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0376
Attention throughput (in TFLOP/s): 102.440
MLP duration (in seconds): 0.0272
MLP throughput (in TFLOP/s): 247.391
Transformer duration (in seconds): 0.0532
Transformer throughput (in TFLOP/s): 198.935
========================================================================================================================
num_attention_heads: 128, hidden_size: 8192, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0418
Attention throughput (in TFLOP/s): 118.484
MLP duration (in seconds): 0.0361
MLP throughput (in TFLOP/s): 243.402
Transformer duration (in seconds): 0.0680
Transformer throughput (in TFLOP/s): 202.184
========================================================================================================================
num_attention_heads: 128, hidden_size: 9216, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0474
Attention throughput (in TFLOP/s): 130.496
MLP duration (in seconds): 0.0458
MLP throughput (in TFLOP/s): 243.321
Transformer duration (in seconds): 0.0861
Transformer throughput (in TFLOP/s): 201.131
========================================================================================================================
num_attention_heads: 128, hidden_size: 10240, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0471
Attention throughput (in TFLOP/s): 160.579
MLP duration (in seconds): 0.0555
MLP throughput (in TFLOP/s): 247.716
Transformer duration (in seconds): 0.1013
Transformer throughput (in TFLOP/s): 210.391
========================================================================================================================
num_attention_heads: 128, hidden_size: 11264, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0524
Attention throughput (in TFLOP/s): 173.033
MLP duration (in seconds): 0.0672
MLP throughput (in TFLOP/s): 247.515
Transformer duration (in seconds): 0.1203
Transformer throughput (in TFLOP/s): 213.573
========================================================================================================================
num_attention_heads: 128, hidden_size: 12288, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0600
Attention throughput (in TFLOP/s): 178.688
MLP duration (in seconds): 0.0788
MLP throughput (in TFLOP/s): 251.122
Transformer duration (in seconds): 0.1387
Transformer throughput (in TFLOP/s): 220.040
========================================================================================================================
num_attention_heads: 128, hidden_size: 13312, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0603
Attention throughput (in TFLOP/s): 207.420
MLP duration (in seconds): 0.0926
MLP throughput (in TFLOP/s): 250.856
Transformer duration (in seconds): 0.1587
Transformer throughput (in TFLOP/s): 225.228
========================================================================================================================
num_attention_heads: 128, hidden_size: 14336, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0705
Attention throughput (in TFLOP/s): 204.652
MLP duration (in seconds): 0.1079
MLP throughput (in TFLOP/s): 249.742
Transformer duration (in seconds): 0.1833
Transformer throughput (in TFLOP/s): 225.752
========================================================================================================================
num_attention_heads: 128, hidden_size: 15360, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0800
Attention throughput (in TFLOP/s): 206.148
MLP duration (in seconds): 0.1236
MLP throughput (in TFLOP/s): 250.092
Transformer duration (in seconds): 0.2085
Transformer throughput (in TFLOP/s): 227.465
========================================================================================================================
num_attention_heads: 128, hidden_size: 16384, train_micro_batch_size_per_gpu: 4, tensor_mp_size: 1, pipeline_mp_size: 1, dp_size: 1

Actual
------
Attention duration (in seconds): 0.0847
Attention throughput (in TFLOP/s): 220.792
MLP duration (in seconds): 0.1413
MLP throughput (in TFLOP/s): 249.029
Transformer duration (in seconds): 0.2326
Transformer throughput (in TFLOP/s): 231.596
========================================================================================================================
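For readers checking these numbers: the printed TFLOP/s values are consistent with throughput = FLOPs / duration under standard Megatron-style FLOP counting (2mnk FLOPs per m-by-n-by-k GEMM). The sketch below reproduces two of the figures above; the sequence length of 2048 and the 4x MLP expansion are assumptions, since neither appears in the log, but both are consistent with the printed throughputs.

```python
# Minimal sketch: recover the TFLOP/s figures above from the printed
# durations, assuming Megatron-style FLOP counts. seq_len=2048 and the
# 4x MLP expansion are assumptions -- neither is printed in the log.

def attention_flops(batch, seq_len, hidden):
    # QKV projections + output projection: 4 GEMMs of (s x h) @ (h x h)
    # -> 8*s*h^2 FLOPs; attention scores and context each add 2*s^2*h.
    return batch * (8 * seq_len * hidden**2 + 4 * seq_len**2 * hidden)

def mlp_flops(batch, seq_len, hidden):
    # Two GEMMs through a 4*hidden intermediate: 2 * (2 * s * h * 4h).
    return batch * 16 * seq_len * hidden**2

def tflops_per_s(flops, seconds):
    return flops / seconds / 1e12

# Check against the hidden_size=16384 block above.
b, s, h = 4, 2048, 16384
print(f"attention: {tflops_per_s(attention_flops(b, s, h), 0.0847):.1f} TFLOP/s")  # ~220.7
print(f"mlp:       {tflops_per_s(mlp_flops(b, s, h), 0.1413):.1f} TFLOP/s")        # ~249.0
```

Under this reading, both kernels plateau near 250 TFLOP/s as the GEMMs grow, while the smallest hidden sizes leave the device heavily underutilized.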