Merge pull request #498 from GATEOverflow/mlperf-inference

Sync from GO
mlcommons · Nov 7, 2024 · 2231aea · 2231aea
2 parents 73a9499 + 2164e66
commit 2231aea
Show file tree

Hide file tree

Showing 7 changed files with 45 additions and 10 deletions.
diff --git a/.github/workflows/test-amd-mlperf-inference-implementations.yml b/.github/workflows/test-amd-mlperf-inference-implementations.yml
@@ -0,0 +1,26 @@
+name: MLPerf Inference AMD implementations
+# Scheduled run of the AMD (ROCm) MLPerf inference implementation through the cm CLI.
+on:
+  schedule:
+    - cron: "29 4 * * *"  # every day at 04:29 UTC; to be adjusted
+
+jobs:
+  build_amd:
+      if: github.repository_owner == 'gateoverflow'  # skipped when the repository owner is anyone else
+      runs-on: [ self-hosted, linux, x64, GO-spr ]
+      strategy:
+        fail-fast: false
+        matrix:
+          python-version: [ "3.12" ]  # NOTE(review): not referenced by the step below
+          model: [ "llama2-70b" ]
+      steps:
+      - name: Test MLPerf Inference AMD (build only) ${{ matrix.model }}
+        run: |
+          if [ -f "gh_action_conda/bin/deactivate" ]; then source gh_action_conda/bin/deactivate; fi
+          python3 -m venv gh_action_conda
+          source gh_action_conda/bin/activate
+          export CM_REPOS=$HOME/GH_CM
+          pip install --upgrade cm4mlops
+          pip install tabulate
+          cm run script --tags=run-mlperf,inference,_all-scenarios,_full,_r4.1-dev  --execution_mode=valid  --pull_changes=yes --pull_inference_changes=yes --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=IntelSPR.24c  --implementation=amd    --backend=pytorch    --category=datacenter --division=open --scenario=Offline  --docker_dt=yes --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --device=rocm  --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean  --docker --quiet
+          # cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=main --commit_message="Results from GH action on SPR.24c" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=IntelSPR.24c
diff --git a/...workflows/test-mlperf-inference-intel.yml → ...ntel-mlperf-inference-implementations.yml b/...workflows/test-mlperf-inference-intel.yml → ...ntel-mlperf-inference-implementations.yml
@@ -2,7 +2,7 @@ name: MLPerf Inference Intel implementations
 
 on:
   schedule:
-    - cron: "29 16 * * *" #to be adjusted
+    - cron: "29 1 * * *" #to be adjusted
 
 jobs:
   build_nvidia:
@@ -21,5 +21,6 @@ jobs:
           source gh_action_conda/bin/activate
           export CM_REPOS=$HOME/GH_CM
           pip install --upgrade cm4mlops
+          pip install tabulate
           cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --preprocess_submission=yes --execution_mode=valid  --pull_changes=yes --pull_inference_changes=yes --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=IntelSPR.24c  --implementation=intel    --backend=pytorch    --category=datacenter --division=open --scenario=Offline  --docker_dt=yes --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --device=cpu  --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean  --docker --quiet
           cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=main --commit_message="Results from GH action on SPR.24c" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=IntelSPR.24c
diff --git a/.github/workflows/test-mlperf-inference-llama2.yml b/.github/workflows/test-mlperf-inference-llama2.yml
@@ -5,12 +5,12 @@ name: MLPerf inference LLAMA 2 70B
 
 on:
   schedule:
-    - cron: "30 19 * * 4"
+    - cron: "30 2 * * 4"
 
 jobs:
   build_reference:
     if: github.repository_owner == 'gateoverflow'
-    runs-on: [ self-hosted, GO-i9, linux, x64 ]
+    runs-on: [ self-hosted, GO-spr, linux, x64 ]
     strategy:
       fail-fast: false
       matrix:
@@ -24,9 +24,10 @@ jobs:
         source gh_action/bin/deactivate || python3 -m venv gh_action
         source gh_action/bin/activate
         export CM_REPOS=$HOME/GH_CM
-        python3 -m pip install cm4mlops
+        pip install cm4mlops
+        pip install tabulate 
         cm pull repo
-        python3 -m pip install "huggingface_hub[cli]"
+        pip install "huggingface_hub[cli]"
         huggingface-cli login --token ${{ secrets.HF_TOKEN }} --add-to-git-credential
     - name: Test MLPerf Inference LLAMA 2 70B reference implementation
       run: |

diff --git a/.github/workflows/test-mlperf-inference-mixtral.yml b/.github/workflows/test-mlperf-inference-mixtral.yml
@@ -5,12 +5,12 @@ name: MLPerf inference MIXTRAL-8x7B
 
 on:
   schedule:
-    - cron: "30 20 * * *"   # 30th minute and 20th hour => 20:30 UTC => 2 AM IST 
+    - cron: "30 03 * * *"   # 30th minute and 3rd hour => 03:30 UTC => 9 AM IST
 
 jobs:
   build_reference:
     if: github.repository_owner == 'gateoverflow'
-    runs-on: [ self-hosted, GO-i9, linux, x64 ]
+    runs-on: [ self-hosted, GO-spr, linux, x64 ]
     strategy:
       fail-fast: false
       matrix:

diff --git a/...ows/test-nvidia-mlperf-implementation.yml → ...idia-mlperf-inference-implementations.yml b/...ows/test-nvidia-mlperf-implementation.yml → ...idia-mlperf-inference-implementations.yml
@@ -21,5 +21,6 @@ jobs:
           source gh_action/bin/activate
           export CM_REPOS=$HOME/GH_CM
           pip install --upgrade cm4mlops
+          pip install tabulate
           cm run script --tags=run-mlperf,inference,_all-scenarios,_submission,_full,_r4.1-dev --preprocess_submission=yes --execution_mode=valid --gpu_name=rtx_4090 --pull_changes=yes --pull_inference_changes=yes --model=${{ matrix.model }} --submitter="MLCommons" --hw_name=RTX4090x2  --implementation=nvidia    --backend=tensorrt    --category=datacenter,edge --division=closed  --docker_dt=yes --docker_it=no --docker_cm_repo=gateoverflow@cm4mlops --adr.compiler.tags=gcc --device=cuda --use_dataset_from_host=yes --results_dir=$HOME/gh_action_results --submission_dir=$HOME/gh_action_submissions --clean  --docker --quiet
           cm run script --tags=push,github,mlperf,inference,submission --repo_url=https://github.com/gateoverflow/mlperf_inference_unofficial_submissions_v5.0 --repo_branch=main --commit_message="Results from GH action on NVIDIA_RTX4090x2" --quiet --submission_dir=$HOME/gh_action_submissions --hw_name=RTX4090x2
diff --git a/.github/workflows/test-scc24-sdxl.yaml b/.github/workflows/test-scc24-sdxl.yaml
@@ -2,7 +2,7 @@ name: MLPerf inference SDXL (SCC)
 
 on:
   schedule:
-    - cron: "20 14 * * *"
+    - cron: "35 19 * * *"
 
 jobs:
   build_reference:

diff --git a/script/get-git-repo/run.sh b/script/get-git-repo/run.sh
@@ -6,7 +6,9 @@ SCRIPT_DIR=${CM_TMP_CURRENT_SCRIPT_PATH}
 
 folder=${CM_GIT_CHECKOUT_FOLDER}
 if [ ! -e "${CM_TMP_GIT_PATH}" ]; then
-  rm -rf ${folder}
+  cmd="rm -rf ${folder}"
+  echo "$cmd" # log the cleanup command before running it
+  rm -rf "${folder}" # run directly (no eval) so the path is never re-parsed or word-split by the shell
   echo "******************************************************"
   echo "Current directory: ${CUR_DIR}"
   echo ""
@@ -16,7 +18,11 @@ if [ ! -e "${CM_TMP_GIT_PATH}" ]; then
   echo ""
 
   ${CM_GIT_CLONE_CMD}
-  test $? -eq 0 || exit $?
+  rcode=$? # exit status of the clone attempt just above; captured before any other command overwrites $?
+  if [ "$rcode" -ne 0 ]; then # first attempt failed: try once more from a clean folder
+    rm -rf "${folder}"
+    ${CM_GIT_CLONE_CMD} || exit $? # if the retry fails too, abort with the clone's own exit status
+  fi
 
   cd ${folder}