From b62101cb99ca16d83a646015679c30fcb152b76d Mon Sep 17 00:00:00 2001 From: MichaelSt98 Date: Tue, 16 Jan 2024 17:11:59 +0200 Subject: [PATCH 1/2] adding col/s as metric for CUDA variants --- src/cloudsc_cuda/cloudsc/cloudsc_driver.cu | 18 +++++++++++------- .../cloudsc/cloudsc_driver_hoist.cu | 18 +++++++++++------- 2 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/cloudsc_cuda/cloudsc/cloudsc_driver.cu b/src/cloudsc_cuda/cloudsc/cloudsc_driver.cu index d715aa68..5ac91de2 100644 --- a/src/cloudsc_cuda/cloudsc/cloudsc_driver.cu +++ b/src/cloudsc_cuda/cloudsc/cloudsc_driver.cu @@ -456,9 +456,9 @@ void cloudsc_driver(int numthreads, int numcols, int nproma) { double t2 = omp_get_wtime(); printf(" NUMOMP=%d, NGPTOT=%d, NPROMA=%d, NGPBLKS=%d\n", numthreads, numcols, nproma, nblocks); - printf(" %+10s%+10s%+10s%+10s%+10s %+4s : %+10s%+10s\n", - "NUMOMP", "NGPTOT", "#GP-cols", "#BLKS", "NPROMA", "tid#", "Time(msec)", "MFlops/s"); - double zfrac, zmflops; + printf(" %+10s%+10s%+10s%+10s%+10s %+4s : %+10s%+10s%+10s\n", + "NUMOMP", "NGPTOT", "#GP-cols", "#BLKS", "NPROMA", "tid#", "Time(msec)", "MFlops/s", "col/s"); + double zfrac, zmflops, zthrput; for (int t = 0; t < numthreads; t++) { const double tloc = zinfo[0][t]; const int coreid = (int) zinfo[1][t]; @@ -467,21 +467,25 @@ void cloudsc_driver(int numthreads, int numcols, int nproma) { zfrac = (double)igpc / (double)numcols; if (tloc > 0.0) { zmflops = 1.0e-06 * zfrac * zhpm * ((double)numcols / 100.) / tloc; + zthrput = (double)numcols/tloc; } else { zmflops = 0.; + zthrput = 0.0; } - printf(" %10d%10d%10d%10d%10d %4d : %10d%10d @ core#\n", - numthreads, numcols, igpc, icalls, nproma, t, (int)(tloc * 1000.), (int)zmflops); + printf(" %10d%10d%10d%10d%10d %4d : %10d%10d%10d @ core#\n", + numthreads, numcols, igpc, icalls, nproma, t, (int)(tloc * 1000.), (int)zmflops, (int)zthrput); } double tdiff = t2 - t1; zfrac = 1.0; if (tdiff > 0.0) { zmflops = 1.0e-06 * zfrac * zhpm * ((double)numcols / 100.) / tdiff; + zthrput = (double)numcols/tdiff; } else { zmflops = 0.0; + zthrput = 0.0; } - printf(" %10d%10d%10d%10d%10d %4d : %10d%10d TOTAL\n", - numthreads, numcols, numcols, nblocks, nproma, -1, (int)(tdiff * 1000.), (int)zmflops); + printf(" %10d%10d%10d%10d%10d %4d: %10d%10d%10d TOTAL\n", + numthreads, numcols, numcols, nblocks, nproma, -1, (int)(tdiff * 1000.), (int)zmflops, (int)zthrput); cloudsc_validate(klon, nlev, nclv, numcols, nproma, plude, pcovptot, prainfrac_toprfz, pfsqlf, pfsqif, diff --git a/src/cloudsc_cuda/cloudsc/cloudsc_driver_hoist.cu b/src/cloudsc_cuda/cloudsc/cloudsc_driver_hoist.cu index 9d7d615e..615cdf25 100644 --- a/src/cloudsc_cuda/cloudsc/cloudsc_driver_hoist.cu +++ b/src/cloudsc_cuda/cloudsc/cloudsc_driver_hoist.cu @@ -497,9 +497,9 @@ void cloudsc_driver(int numthreads, int numcols, int nproma) { double t2 = omp_get_wtime(); printf(" NUMOMP=%d, NGPTOT=%d, NPROMA=%d, NGPBLKS=%d\n", numthreads, numcols, nproma, nblocks); - printf(" %+10s%+10s%+10s%+10s%+10s %+4s : %+10s%+10s\n", - "NUMOMP", "NGPTOT", "#GP-cols", "#BLKS", "NPROMA", "tid#", "Time(msec)", "MFlops/s"); - double zfrac, zmflops; + printf(" %+10s%+10s%+10s%+10s%+10s %+4s : %+10s%+10s%+10s\n", + "NUMOMP", "NGPTOT", "#GP-cols", "#BLKS", "NPROMA", "tid#", "Time(msec)", "MFlops/s", "col/s"); + double zfrac, zmflops, zthrput; for (int t = 0; t < numthreads; t++) { const double tloc = zinfo[0][t]; const int coreid = (int) zinfo[1][t]; @@ -508,21 +508,25 @@ void cloudsc_driver(int numthreads, int numcols, int nproma) { zfrac = (double)igpc / (double)numcols; if (tloc > 0.0) { zmflops = 1.0e-06 * zfrac * zhpm * ((double)numcols / 100.) / tloc; + zthrput = (double)numcols/tloc; } else { zmflops = 0.; + zthrput = 0.0; } - printf(" %10d%10d%10d%10d%10d %4d : %10d%10d @ core#\n", - numthreads, numcols, igpc, icalls, nproma, t, (int)(tloc * 1000.), (int)zmflops); + printf(" %10d%10d%10d%10d%10d %4d : %10d%10d%10d @ core#\n", + numthreads, numcols, igpc, icalls, nproma, t, (int)(tloc * 1000.), (int)zmflops, (int)zthrput); } double tdiff = t2 - t1; zfrac = 1.0; if (tdiff > 0.0) { zmflops = 1.0e-06 * zfrac * zhpm * ((double)numcols / 100.) / tdiff; + zthrput = (double)numcols/tdiff; } else { zmflops = 0.0; + zthrput = 0.0; } - printf(" %10d%10d%10d%10d%10d %4d : %10d%10d TOTAL\n", - numthreads, numcols, numcols, nblocks, nproma, -1, (int)(tdiff * 1000.), (int)zmflops); + printf(" %10d%10d%10d%10d%10d %4d: %10d%10d%10d TOTAL\n", + numthreads, numcols, numcols, nblocks, nproma, -1, (int)(tdiff * 1000.), (int)zmflops, (int)zthrput); cloudsc_validate(klon, nlev, nclv, numcols, nproma, plude, pcovptot, prainfrac_toprfz, pfsqlf, pfsqif, From 995cadb190fe0a2ed1f137d978cfcd05ad9bc611 Mon Sep 17 00:00:00 2001 From: Michael Staneker Date: Wed, 7 Feb 2024 08:44:11 +0000 Subject: [PATCH 2/2] beautifying: fix alignment of the printed table for CUDA variants --- src/cloudsc_cuda/cloudsc/cloudsc_driver.cu | 2 +- src/cloudsc_cuda/cloudsc/cloudsc_driver_hoist.cu | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cloudsc_cuda/cloudsc/cloudsc_driver.cu b/src/cloudsc_cuda/cloudsc/cloudsc_driver.cu index 5ac91de2..96d04519 100644 --- a/src/cloudsc_cuda/cloudsc/cloudsc_driver.cu +++ b/src/cloudsc_cuda/cloudsc/cloudsc_driver.cu @@ -484,7 +484,7 @@ void cloudsc_driver(int numthreads, int numcols, int nproma) { zmflops = 0.0; zthrput = 0.0; } - printf(" %10d%10d%10d%10d%10d %4d: %10d%10d%10d TOTAL\n", + printf(" %10d%10d%10d%10d%10d %4d : %10d%10d%10d TOTAL\n", numthreads, numcols, numcols, nblocks, nproma, -1, (int)(tdiff * 1000.), (int)zmflops, (int)zthrput); cloudsc_validate(klon, nlev, nclv, numcols, nproma, diff --git a/src/cloudsc_cuda/cloudsc/cloudsc_driver_hoist.cu b/src/cloudsc_cuda/cloudsc/cloudsc_driver_hoist.cu index 615cdf25..152abb1e 100644 --- a/src/cloudsc_cuda/cloudsc/cloudsc_driver_hoist.cu +++ b/src/cloudsc_cuda/cloudsc/cloudsc_driver_hoist.cu @@ -525,7 +525,7 @@ void cloudsc_driver(int numthreads, int numcols, int nproma) { zmflops = 0.0; zthrput = 0.0; } - printf(" %10d%10d%10d%10d%10d %4d: %10d%10d%10d TOTAL\n", + printf(" %10d%10d%10d%10d%10d %4d : %10d%10d%10d TOTAL\n", numthreads, numcols, numcols, nblocks, nproma, -1, (int)(tdiff * 1000.), (int)zmflops, (int)zthrput); cloudsc_validate(klon, nlev, nclv, numcols, nproma,