From b51bee6b03316b898dc34775f82b519dfab539ca Mon Sep 17 00:00:00 2001 From: Andrea Valassi Date: Thu, 13 May 2021 10:42:24 +0200 Subject: [PATCH] [warps] add also per-second divergence metrics and add a few comments Note that I tried the 'stalled_barrier' metrics and they do not seem interesting On itscrd70.cern.ch (V100S-PCIE-32GB): ========================================================================= Process = EPOCH1_EEMUMU_CUDA [nvcc 11.0.221] FP precision = DOUBLE (NaN/abnormal=0, zero=0) EvtsPerSec[MatrixElems] (3) = ( 5.711994e+08 ) sec^-1 MeanMatrixElemValue = ( 1.371706e-02 +- 3.270315e-06 ) GeV^0 TOTAL : 0.745683 sec 2,603,540,638 cycles # 2.655 GHz 3,537,849,260 instructions # 1.36 insn per cycle 1.049477458 seconds time elapsed ==PROF== Profiling "sigmaKin": launch__registers_per_thread 128 ==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 96.33% : smsp__sass_branch_targets.sum 109 4.18/usecond : smsp__sass_branch_targets_threads_uniform.sum 105 4.03/usecond : smsp__sass_branch_targets_threads_divergent.sum 4 153.37/msecond : smsp__warps_launched.sum 1 ========================================================================= --- .../SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc | 6 +++--- .../P1_Sigma_sm_epem_mupmum/throughput12.sh | 10 ++++++++-- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 7afdb65b00..70ff4db748 100644 --- a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -17,8 +17,8 @@ #include "CPPProcess.h" // Test ncu metrics for CUDA thread divergence -#undef MGONGPU_TEST_DIVERGENCE -//#define MGONGPU_TEST_DIVERGENCE 1 +//#undef MGONGPU_TEST_DIVERGENCE +#define MGONGPU_TEST_DIVERGENCE 1 
//========================================================================== // Class member functions for calculating the matrix elements for @@ -116,7 +116,7 @@ namespace Proc opzxxx( allmomenta, cHel[ihel][0], -1, w_sv[0], 0 ); //oxxxxx( allmomenta, 0, cHel[ihel][0], -1, w_sv[0], 0 ); // tested ok (much slower) #else - if ( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 ==0 ) + if ( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) opzxxx( allmomenta, cHel[ihel][0], -1, w_sv[0], 0 ); else oxxxxx( allmomenta, 0, cHel[ihel][0], -1, w_sv[0], 0 ); // tested ok (much slower) diff --git a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh index 92441a0145..1e43574968 100755 --- a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh +++ b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh @@ -158,13 +158,19 @@ function runNcu() { set +x } -# Profile divergenece metrics more in detail +# Profile divergence metrics more in detail +# See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/branchstatistics.htm +# See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/sourcelevel/divergentbranch.htm function runNcuDiv() { exe=$1 args="-p 1 32 1" ###echo "runNcuDiv $exe $args OMP=$OMP_NUM_THREADS" if [ "${verbose}" == "1" ]; then set -x; fi - $(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s 
%s\n", "", $24, $26}' + ###$(which ncu) --query-metrics $exe $args + ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args + ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args + ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' + $(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' set +x }