diff --git a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc index 7afdb65b00..70ff4db748 100644 --- a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc +++ b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/CPPProcess.cc @@ -17,8 +17,8 @@ #include "CPPProcess.h" // Test ncu metrics for CUDA thread divergence -#undef MGONGPU_TEST_DIVERGENCE -//#define MGONGPU_TEST_DIVERGENCE 1 +//#undef MGONGPU_TEST_DIVERGENCE +#define MGONGPU_TEST_DIVERGENCE 1 //========================================================================== // Class member functions for calculating the matrix elements for @@ -116,7 +116,7 @@ namespace Proc opzxxx( allmomenta, cHel[ihel][0], -1, w_sv[0], 0 ); //oxxxxx( allmomenta, 0, cHel[ihel][0], -1, w_sv[0], 0 ); // tested ok (much slower) #else - if ( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 ==0 ) + if ( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 ) opzxxx( allmomenta, cHel[ihel][0], -1, w_sv[0], 0 ); else oxxxxx( allmomenta, 0, cHel[ihel][0], -1, w_sv[0], 0 ); // tested ok (much slower) diff --git a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh index 92441a0145..1e43574968 100755 --- a/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh +++ b/epoch1/cuda/ee_mumu/SubProcesses/P1_Sigma_sm_epem_mupmum/throughput12.sh @@ -158,13 +158,19 @@ function runNcu() { set +x } -# Profile divergenece metrics more in detail +# Profile divergence metrics more in detail +# See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/branchstatistics.htm +# See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/sourcelevel/divergentbranch.htm function runNcuDiv() { exe=$1 args="-p 1 32 1" ###echo "runNcuDiv $exe $args OMP=$OMP_NUM_THREADS" if [ "${verbose}" == "1" ]; then set -x; fi - $(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' + ###$(which ncu) --query-metrics $exe $args + ###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args + ###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args + ###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}' + $(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}' set +x }