Skip to content

Commit

Permalink
[warps] add also per-second divergence metrics and add a few comments
Browse files Browse the repository at this point in the history
Note that I tried the 'stalled_barrier' metrics and they do not seem interesting

On itscrd70.cern.ch (V100S-PCIE-32GB):
=========================================================================
Process                     = EPOCH1_EEMUMU_CUDA [nvcc 11.0.221]
FP precision                = DOUBLE (NaN/abnormal=0, zero=0)
EvtsPerSec[MatrixElems] (3) = ( 5.711994e+08                 )  sec^-1
MeanMatrixElemValue         = ( 1.371706e-02 +- 3.270315e-06 )  GeV^0
TOTAL       :     0.745683 sec
     2,603,540,638      cycles                    #    2.655 GHz
     3,537,849,260      instructions              #    1.36  insn per cycle
       1.049477458 seconds time elapsed
==PROF== Profiling "sigmaKin": launch__registers_per_thread 128
==PROF== Profiling "sigmaKin": sm__sass_average_branch_targets_threads_uniform.pct 96.33%
                             : smsp__sass_branch_targets.sum                       109        4.18/usecond
                             : smsp__sass_branch_targets_threads_uniform.sum       105        4.03/usecond
                             : smsp__sass_branch_targets_threads_divergent.sum     4          153.37/msecond
                             : smsp__warps_launched.sum                            1
=========================================================================
  • Loading branch information
valassi committed May 13, 2021
1 parent 8514f13 commit b51bee6
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
#include "CPPProcess.h"

// Test ncu metrics for CUDA thread divergence
#undef MGONGPU_TEST_DIVERGENCE
//#define MGONGPU_TEST_DIVERGENCE 1
//#undef MGONGPU_TEST_DIVERGENCE
#define MGONGPU_TEST_DIVERGENCE 1

//==========================================================================
// Class member functions for calculating the matrix elements for
Expand Down Expand Up @@ -116,7 +116,7 @@ namespace Proc
opzxxx( allmomenta, cHel[ihel][0], -1, w_sv[0], 0 );
//oxxxxx( allmomenta, 0, cHel[ihel][0], -1, w_sv[0], 0 ); // tested ok (much slower)
#else
if ( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 ==0 )
if ( ( blockDim.x * blockIdx.x + threadIdx.x ) % 2 == 0 )
opzxxx( allmomenta, cHel[ihel][0], -1, w_sv[0], 0 );
else
oxxxxx( allmomenta, 0, cHel[ihel][0], -1, w_sv[0], 0 ); // tested ok (much slower)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,19 @@ function runNcu() {
set +x
}

# Profile divergenece metrics more in detail
# Profile divergence metrics more in detail
# See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/branchstatistics.htm
# See https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/sourcelevel/divergentbranch.htm
# runNcuDiv <exe>
# Runs <exe> under the ncu profiler, restricted to the "sigmaKin" kernel, and
# prints a compact table of the branch-target (thread divergence) metrics.
#   $1 : executable to profile
# Reads the global ${verbose}: when it is "1", shell tracing (set -x) is switched
# on before the ncu calls; tracing is always switched off again (set +x) at the end.
# NOTE(review): this view contains two active ncu invocations (one without and
# one with the *.per_second metrics); presumably only the second one is meant
# to remain in the script - confirm against the checked-in file.
function runNcuDiv() {
exe=$1
# Fixed arguments passed to the profiled executable.
# NOTE(review): presumably blocks/threads/iterations, i.e. one 32-thread warp - confirm.
args="-p 1 32 1"
###echo "runNcuDiv $exe $args OMP=$OMP_NUM_THREADS"
if [ "${verbose}" == "1" ]; then set -x; fi
# Collect the five counter metrics only. The ncu output is filtered to lines
# matching 'sigmaKin' or ' sm', flattened onto a single line by tr, and awk then
# prints name/value pairs picked by hard-coded field position. These positions
# depend on the number and order of the requested metrics and on the ncu output layout.
$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}'
# Commented-out alternatives kept for reference: list all available metrics, or
# collect every metric whose name matches a regex (branch_targets, stalled_barrier).
###$(which ncu) --query-metrics $exe $args
###$(which ncu) --metrics regex:.*branch_targets.* --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args
###$(which ncu) --metrics regex:.*stalled_barrier.* --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args
###$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %s\n", "", $18, $19; printf "%29s: %-51s %s\n", "", $22, $23; printf "%29s: %-51s %s\n", "", $20, $21; printf "%29s: %-51s %s\n", "", $24, $26}'
# Same pipeline with the three *.per_second metrics added. The first three printf
# calls print the metric name, its value, and a third column built by concatenating
# two adjacent fields without a separator (e.g. "4.18/usecond"); the last printf
# prints a name and a value only. The field numbers differ from the command above
# because the additional metrics shift the positions in the flattened output.
$(which ncu) --metrics sm__sass_average_branch_targets_threads_uniform.pct,smsp__warps_launched.sum,smsp__sass_branch_targets.sum,smsp__sass_branch_targets_threads_divergent.sum,smsp__sass_branch_targets_threads_uniform.sum,smsp__sass_branch_targets.sum.per_second,smsp__sass_branch_targets_threads_divergent.sum.per_second,smsp__sass_branch_targets_threads_uniform.sum.per_second --target-processes all --kernel-id "::sigmaKin:" --print-kernel-base mangled $exe $args | egrep '(sigmaKin| sm)' | tr "\n" " " | awk '{printf "%29s: %-51s %-10s %s\n", "", $18, $19, $22$21; printf "%29s: %-51s %-10s %s\n", "", $28, $29, $32$31; printf "%29s: %-51s %-10s %s\n", "", $23, $24, $27$26; printf "%29s: %-51s %s\n", "", $33, $35}'
set +x
}

Expand Down

0 comments on commit b51bee6

Please sign in to comment.