[klas5X] COMPLETE the port of the latest klas5 in epoch1/eemumu to klas5X in epochX/eemumu

Ported everything as-is (in one single commit), including stuff that eventually needs to be cleaned up
(e.g. it will probably be better to support only certain choices of neppM and neppV and fail otherwise)

This essentially completes (except for backports) issue madgraph5#176 about removing neppV from the public API...

Performance (see next commit) is almost the same.
The 512y eemumu build probably loses about 1% because I removed a few vectorized operations on allMEs.
Maybe I can get back a tiny bit even there.
valassi committed Nov 30, 2021
1 parent 50c387e commit 98008b4
Showing 12 changed files with 331 additions and 160 deletions.
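
The neppV removal described in the commit message shows up below as a change of the public signatures from fptype_sv* to plain fptype*: callers now pass ordinary scalar arrays and SIMD vectorization becomes an internal detail. As a hedged, minimal sketch of what a host-side caller looks like after this change (it depends on the cudacpp ee_mumu headers for fptype, hstMakeUnique and Proc::sigmaKin; the constants mirror those used in check.cc further down and are purely illustrative):

// Hedged caller-side sketch (not verbatim from this commit).
const int nevt = 16384;          // number of events per iteration
const int npar = 4;              // e+ e- mu+ mu-
const int np4  = 4;              // E, px, py, pz
auto hstMomenta = hstMakeUnique<fptype>( np4*npar*nevt ); // AOSOA[npagM][npar][np4][neppM]
auto hstMEs     = hstMakeUnique<fptype>( nevt );          // plain ARRAY[nevt]
// ... fill hstMomenta (random numbers + phase-space sampling, as in check.cc) ...
Proc::sigmaKin( hstMomenta.get(), hstMEs.get(), nevt );   // no fptype_sv, no neppV in the call

Inside calculate_wavefunctions (see the hunks below), an alignment check then decides per buffer whether a vector load/store is safe or whether a scalar fallback is needed.
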
19 changes: 16 additions & 3 deletions epochX/cudacpp/ee_mumu/SubProcesses/Memory.h
@@ -1,8 +1,9 @@
/*
* memory.h
* Memory.h
*
* Created on: 19.11.2020
* Author: shageboeck
* Modified: avalassi (vectors, alignment)
*/

#ifndef MEMORY_H
@@ -46,13 +47,25 @@ std::unique_ptr<T[], CudaHstDeleter<T>> hstMakeUnique(std::size_t N) {

#else

template<typename T = fptype>
struct CppHstDeleter {
void operator()(T* mem) {
::operator delete( mem, std::align_val_t{ mgOnGpu::cppAlign } );
}
};

template<typename T = fptype> inline
std::unique_ptr<T[]> hstMakeUnique(std::size_t N) { return std::unique_ptr<T[]>{ new T[N]() }; };
std::unique_ptr<T[], CppHstDeleter<T>> hstMakeUnique(std::size_t N) {
// See https://www.bfilipek.com/2019/08/newnew-align.html#requesting-different-alignment
return std::unique_ptr<T[], CppHstDeleter<T>>{ new( std::align_val_t{ mgOnGpu::cppAlign } ) T[N]() };
};

#ifdef MGONGPU_CPPSIMD

template<> inline
std::unique_ptr<fptype_v[]> hstMakeUnique(std::size_t N) { return std::unique_ptr<fptype_v[]>{ new fptype_v[N/neppV]() }; };
std::unique_ptr<fptype_v[], CppHstDeleter<fptype_v>> hstMakeUnique(std::size_t N) {
return std::unique_ptr<fptype_v[], CppHstDeleter<fptype_v>>{ new( std::align_val_t{ mgOnGpu::cppAlign } ) fptype_v[N/neppV]() };
};

#endif
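
A self-contained sketch of the aligned-allocation pattern introduced above, using plain double and hypothetical names (Align, AlignedDeleter and makeAligned are illustrative, not from the repository); the key point is that the alignment passed to the placement new must also be passed to operator delete in the custom deleter:

// Illustrative stand-ins for mgOnGpu::cppAlign / CppHstDeleter / hstMakeUnique (C++17).
#include <cassert>
#include <cstddef>
#include <memory>
#include <new>

constexpr std::size_t Align = 64; // hypothetical SIMD-friendly alignment (8 doubles)

struct AlignedDeleter {
  void operator()( double* mem ) {
    ::operator delete[]( mem, std::align_val_t{ Align } ); // must match the aligned new[]
  }
};

std::unique_ptr<double[], AlignedDeleter> makeAligned( std::size_t n ) {
  // Zero-initialised array with over-aligned storage (cf. the bfilipek link above)
  return std::unique_ptr<double[], AlignedDeleter>{
    new( std::align_val_t{ Align } ) double[n]() };
}

int main() {
  auto buf = makeAligned( 1024 );
  assert( reinterpret_cast<std::size_t>( buf.get() ) % Align == 0 ); // SIMD-friendly
  buf[0] = 1.;
  return 0;
}

(The sketch pairs the array forms new[] and operator delete[]; the deleter in the diff above passes the same alignment argument to the non-array ::operator delete.)
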

@@ -71,10 +71,10 @@ namespace Proc
__device__
INLINE
void calculate_wavefunctions( int ihel,
const fptype_sv* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype_sv* allMEs // output: allMEs[npagM][neppM], final |M|^2 averaged over helicities
const fptype* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype* allMEs // output: allMEs[nevt], |M|^2 running_sum_over_helicities
#ifndef __CUDACC__
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#endif
)
//ALWAYS_INLINE // attributes are not permitted in a function definition
@@ -99,14 +99,21 @@ namespace Proc
// === Calculate wavefunctions and amplitudes for all diagrams in all processes - Loop over nevt events ===
#ifndef __CUDACC__
const int npagV = nevt / neppV;
#ifdef MGONGPU_CPPSIMD
const bool isAligned_allMEs = ( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // require SIMD-friendly alignment by at least neppV*sizeof(fptype)
#endif
// ** START LOOP ON IPAGV **
#ifdef _OPENMP
// (NB gcc9 or higher, or clang, is required)
// - default(none): no variables are shared by default
// - shared: as the name says
// - private: give each thread its own copy, without initialising
// - firstprivate: give each thread its own copy, and initialise with value from outside
#ifdef MGONGPU_CPPSIMD
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV,isAligned_allMEs) private (amp_sv,w_sv,jamp_sv)
#else
#pragma omp parallel for default(none) shared(allmomenta,allMEs,cHel,cIPC,cIPD,ihel,npagV) private (amp_sv,w_sv,jamp_sv)
#endif
#endif
for ( int ipagV = 0; ipagV < npagV; ++ipagV )
#endif
@@ -193,13 +200,27 @@ namespace Proc
#ifdef __CUDACC__
const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
allMEs[ievt] += deltaMEs;
//printf( "calculate_wavefunction: %6d %2d %f\n", ievt, ihel, allMEs[ievt] );
//if ( cNGoodHel > 0 ) printf( "calculate_wavefunction: %6d %2d %f\n", ievt, ihel, allMEs[ievt] );
#else
#ifdef MGONGPU_CPPSIMD
if ( isAligned_allMEs )
{
*reinterpret_cast<fptype_sv*>( &( allMEs[ipagV*neppV] ) ) += deltaMEs;
}
else
{
for ( int ieppV=0; ieppV<neppV; ieppV++ )
allMEs[ipagV*neppV + ieppV] += deltaMEs[ieppV];
}
//if ( cNGoodHel > 0 )
// for ( int ieppV=0; ieppV<neppV; ieppV++ )
// printf( "calculate_wavefunction: %6d %2d %f\n", ipagV*neppV+ieppV, ihel, allMEs[ipagV][ieppV] );
#else
allMEs[ipagV] += deltaMEs;
//printf( "calculate_wavefunction: %6d %2d %f\n", ipagV, ihel, allMEs[ipagV] ); // FIXME for MGONGPU_CPPSIMD
//if ( cNGoodHel > 0 ) printf( "calculate_wavefunction: %6d %2d %f\n", ipagV, ihel, allMEs[ipagV] );
#endif
#endif
}

mgDebug( 1, __FUNCTION__ );
return;
}
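
A self-contained sketch of the alignment check and vector-or-scalar accumulation on allMEs used above; Vec plays the role of fptype_v and is assumed here to be a gcc/clang vector-extension type of neppV values, with illustrative values for neppV and cppAlign:

// Illustrative sketch (gcc/clang, C++17): aligned vector store with scalar fallback.
#include <cstddef>
#include <cstdio>

constexpr int neppV = 4;                                  // events per vector (power of two)
constexpr std::size_t cppAlign = neppV * sizeof(double);  // 32 bytes for double[4]
typedef double Vec __attribute__ ((vector_size (neppV * sizeof(double))));

void accumulate( double* allMEs, const Vec& deltaMEs, int ipagV )
{
  // Vector read-modify-write only if the output buffer has SIMD-friendly alignment
  const bool isAligned = ( (std::size_t)(allMEs) % cppAlign == 0 );
  if ( isAligned )
    *reinterpret_cast<Vec*>( &( allMEs[ipagV*neppV] ) ) += deltaMEs;
  else
    for ( int ieppV = 0; ieppV < neppV; ieppV++ ) // scalar fallback, one event at a time
      allMEs[ipagV*neppV + ieppV] += deltaMEs[ieppV];
}

int main()
{
  alignas(32) double mes[neppV] = { 0, 0, 0, 0 };
  const Vec delta = { 1, 2, 3, 4 };
  accumulate( mes, delta, 0 );
  printf( "%f %f %f %f\n", mes[0], mes[1], mes[2], mes[3] ); // 1 2 3 4
  return 0;
}

In the hot loop above, the aligned case costs a single vector operation per page of neppV events, while buffers without the required alignment still produce the same results through the scalar path.
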
@@ -243,10 +264,13 @@ namespace Proc
memcpy( cHel, tHel, ncomb * mgOnGpu::npar * sizeof(short) );
#endif
// SANITY CHECK: GPU memory usage may be based on casts of fptype[2] to cxtype
assert( sizeof(cxtype) == 2 * sizeof(fptype) );
static_assert( sizeof(cxtype) == 2 * sizeof(fptype) );
#ifndef __CUDACC__
// SANITY CHECK: momenta AOSOA uses vectors with the same size as fptype_v
assert( neppV == mgOnGpu::neppM );
// SANITY CHECK: check that neppR, neppM and neppV are powers of two (https://stackoverflow.com/a/108360)
auto ispoweroftwo = []( int n ) { return ( n > 0 ) && !( n & ( n - 1 ) ); };
static_assert( ispoweroftwo( mgOnGpu::neppR ) );
static_assert( ispoweroftwo( mgOnGpu::neppM ) );
static_assert( ispoweroftwo( neppV ) );
#endif
}
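
The compile-time checks above rely on the classic n & (n-1) trick; a minimal standalone illustration:

// n & (n-1) clears the lowest set bit, so it is zero exactly for powers of two (with n>0).
constexpr bool ispoweroftwo( int n ) { return ( n > 0 ) && !( n & ( n - 1 ) ); }

static_assert( ispoweroftwo( 1 ) && ispoweroftwo( 4 ) && ispoweroftwo( 64 ), "powers of two" );
static_assert( !ispoweroftwo( 0 ) && !ispoweroftwo( 6 ) && !ispoweroftwo( 12 ), "not powers of two" );

int main() { return 0; }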

@@ -356,9 +380,9 @@ namespace Proc

#ifdef __CUDACC__
__global__
void sigmaKin_getGoodHel( const fptype_sv* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype_sv* allMEs, // output: allMEs[npagM][neppM], final |M|^2 averaged over helicities
bool* isGoodHel ) // output: isGoodHel[ncomb] - device array
void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
bool* isGoodHel ) // output: isGoodHel[ncomb] - device array
{
const int ievt = blockDim.x * blockIdx.x + threadIdx.x; // index of event (thread) in grid
// FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?)
Expand All @@ -376,33 +400,35 @@ namespace Proc
}
}
#else
void sigmaKin_getGoodHel( const fptype_sv* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype_sv* allMEs, // output: allMEs[npagM][neppM], final |M|^2 averaged over helicities
bool* isGoodHel // output: isGoodHel[ncomb] - device array
, const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
bool* isGoodHel // output: isGoodHel[ncomb] - device array
, const int nevt ) // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
{
assert( (size_t)(allmomenta) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment
//assert( (size_t)(allMEs) % mgOnGpu::cppAlign == 0 ); // SANITY CHECK: require SIMD-friendly alignment
const int maxtry0 = ( neppV > 16 ? neppV : 16 ); // 16, but at least neppV (otherwise the npagV loop does not even start)
fptype_sv allMEsLast[maxtry0/neppV] = { 0 };
fptype allMEsLast[maxtry0] = { 0 };
const int maxtry = std::min( maxtry0, nevt ); // 16, but at most nevt (avoid invalid memory access if nevt<maxtry0)
for ( int ipagV = 0; ipagV < maxtry/neppV; ++ipagV )
for ( int ievt = 0; ievt < maxtry; ++ievt )
{
// FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?)
allMEs[ipagV] = fptype_sv{0}; // all zeros
allMEs[ievt] = 0; // all zeros
}
for ( int ihel = 0; ihel < ncomb; ihel++ )
{
//std::cout << "sigmaKin_getGoodHel ihel=" << ihel << ( isGoodHel[ihel] ? " true" : " false" ) << std::endl;
calculate_wavefunctions( ihel, allmomenta, allMEs, maxtry );
for ( int ipagV = 0; ipagV < maxtry/neppV; ++ipagV )
for ( int ievt = 0; ievt < maxtry; ++ievt )
{
// FIXME: assume process.nprocesses == 1 for the moment (eventually: need a loop over processes here?)
const bool differs = maskor( allMEs[ipagV] != allMEsLast[ipagV] ); // true if any of the neppV events differs
const bool differs = ( allMEs[ievt] != allMEsLast[ievt] );
if ( differs )
{
//if ( !isGoodHel[ihel] ) std::cout << "sigmaKin_getGoodHel ihel=" << ihel << " TRUE" << std::endl;
isGoodHel[ihel] = true;
}
allMEsLast[ipagV] = allMEs[ipagV]; // running sum up to helicity ihel
allMEsLast[ievt] = allMEs[ievt]; // running sum up to helicity ihel
}
}
}
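
The host-side good-helicity scan above can be summarised by the following self-contained sketch, where a dummy contribution function stands in for calculate_wavefunctions and the numbers are purely illustrative:

// A helicity is "good" if adding its contribution changes the running |M|^2 sum
// for at least one of the first few test events.
#include <cstdio>
#include <vector>

constexpr int ncomb  = 4; // hypothetical number of helicity combinations
constexpr int maxtry = 8; // test only a few events, as in the code above

// Hypothetical stand-in for calculate_wavefunctions: adds helicity ihel's |M|^2
// to the running sum of each event (here, odd helicities contribute nothing).
void addHelicity( int ihel, std::vector<double>& running )
{
  const double contrib = ( ihel % 2 == 0 ? 1. + ihel : 0. );
  for ( double& me : running ) me += contrib;
}

int main()
{
  std::vector<double> running( maxtry, 0. ), last( maxtry, 0. );
  bool isGoodHel[ncomb] = { false };
  for ( int ihel = 0; ihel < ncomb; ihel++ )
  {
    addHelicity( ihel, running );
    for ( int ievt = 0; ievt < maxtry; ievt++ )
    {
      if ( running[ievt] != last[ievt] ) isGoodHel[ihel] = true; // this helicity contributes
      last[ievt] = running[ievt];                                // running sum up to ihel
    }
  }
  for ( int ihel = 0; ihel < ncomb; ihel++ )
    printf( "ihel=%d isGoodHel=%s\n", ihel, ( isGoodHel[ihel] ? "true" : "false" ) );
  return 0;
}
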
@@ -439,10 +465,10 @@ namespace Proc
// FIXME: assume process.nprocesses == 1 (eventually: allMEs[nevt] -> allMEs[nevt*nprocesses]?)

__global__
void sigmaKin( const fptype_sv* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype_sv* allMEs // output: allMEs[npagM][neppM], final |M|^2 averaged over helicities
void sigmaKin( const fptype* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype* allMEs // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifndef __CUDACC__
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#endif
)
{
Expand Down Expand Up @@ -472,7 +498,8 @@ namespace Proc
const int npagV = nevt/neppV;
for ( int ipagV = 0; ipagV < npagV; ++ipagV )
{
allMEs[ipagV] = fptype_sv{ 0 };
for ( int ieppV=0; ieppV<neppV; ieppV++ )
allMEs[ipagV*neppV + ieppV] = 0; // all zeros
}
#endif

Expand Down Expand Up @@ -500,7 +527,8 @@ namespace Proc
#else
for ( int ipagV = 0; ipagV < npagV; ++ipagV )
{
allMEs[ipagV] /= denominators;
for ( int ieppV=0; ieppV<neppV; ieppV++ )
allMEs[ipagV*neppV + ieppV] /= denominators;
}
#endif
mgDebugFinalise();
@@ -113,11 +113,11 @@ namespace Proc
//--------------------------------------------------------------------------

__global__
void sigmaKin_getGoodHel( const fptype_sv* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype_sv* allMEs, // output: allMEs[npagM][neppM], final |M|^2 averaged over helicities
bool* isGoodHel // output: isGoodHel[ncomb] - device array
void sigmaKin_getGoodHel( const fptype* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype* allMEs, // output: allMEs[nevt], |M|^2 final_avg_over_helicities
bool* isGoodHel // output: isGoodHel[ncomb] - device array
#ifndef __CUDACC__
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#endif
);

Expand All @@ -128,10 +128,10 @@ namespace Proc
//--------------------------------------------------------------------------

__global__
void sigmaKin( const fptype_sv* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype_sv* allMEs // output: allMEs[npagM][neppM], final |M|^2 averaged over helicities
void sigmaKin( const fptype* allmomenta, // input: momenta as AOSOA[npagM][npar][4][neppM] with nevt=npagM*neppM
fptype* allMEs // output: allMEs[nevt], |M|^2 final_avg_over_helicities
#ifndef __CUDACC__
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
, const int nevt // input: #events (for cuda: nevt == ndim == gpublocks*gputhreads)
#endif
);

@@ -316,19 +316,19 @@ int main(int argc, char **argv)
const int nMEs = nevt; // FIXME: assume process.nprocesses == 1 (eventually: nMEs = nevt * nprocesses?)

#if defined MGONGPU_CURAND_ONHOST or defined MGONGPU_COMMONRAND_ONHOST or not defined __CUDACC__
auto hstRnarray = hstMakeUnique<fptype >( nRnarray ); // AOSOA[npagR][nparf][np4][neppR] (NB: nevt=npagR*neppR)
auto hstRnarray = hstMakeUnique<fptype>( nRnarray ); // AOSOA[npagR][nparf][np4][neppR] (NB: nevt=npagR*neppR)
#endif
auto hstMomenta = hstMakeUnique<fptype_sv>( nMomenta ); // AOSOA[npagM][npar][np4][neppM] (NB: nevt=npagM*neppM)
auto hstIsGoodHel = hstMakeUnique<bool >( ncomb );
auto hstWeights = hstMakeUnique<fptype >( nWeights );
auto hstMEs = hstMakeUnique<fptype_sv>( nMEs ); // AOSOA[npagM][neppM] (NB: nevt=npagM*neppM)
auto hstMomenta = hstMakeUnique<fptype>( nMomenta ); // AOSOA[npagM][npar][np4][neppM] (NB: nevt=npagM*neppM)
auto hstIsGoodHel = hstMakeUnique<bool >( ncomb );
auto hstWeights = hstMakeUnique<fptype>( nWeights );
auto hstMEs = hstMakeUnique<fptype>( nMEs ); // ARRAY[nevt]

#ifdef __CUDACC__
auto devRnarray = devMakeUnique<fptype >( nRnarray ); // AOSOA[npagR][nparf][np4][neppR] (NB: nevt=npagR*neppR)
auto devMomenta = devMakeUnique<fptype >( nMomenta ); // AOSOA[npagM][npar][np4][neppM] (NB: nevt=npagM*neppM)
auto devIsGoodHel = devMakeUnique<bool >( ncomb );
auto devWeights = devMakeUnique<fptype >( nWeights );
auto devMEs = devMakeUnique<fptype >( nMEs ); // AOSOA[npagM][neppM] (NB: nevt=npagM*neppM)
auto devRnarray = devMakeUnique<fptype>( nRnarray ); // AOSOA[npagR][nparf][np4][neppR] (NB: nevt=npagR*neppR)
auto devMomenta = devMakeUnique<fptype>( nMomenta ); // AOSOA[npagM][npar][np4][neppM] (NB: nevt=npagM*neppM)
auto devIsGoodHel = devMakeUnique<bool >( ncomb );
auto devWeights = devMakeUnique<fptype>( nWeights );
auto devMEs = devMakeUnique<fptype>( nMEs ); // ARRAY[nevt]

#if defined MGONGPU_CURAND_ONHOST or defined MGONGPU_COMMONRAND_ONHOST
const int nbytesRnarray = nRnarray * sizeof(fptype);
@@ -553,37 +553,22 @@ int main(int argc, char **argv)
// NB: 'setw' affects only the next field (of any type)
std::cout << std::scientific // fixed format: affects all floats (default precision: 6)
<< std::setw(4) << ipar + 1
#ifndef MGONGPU_CPPSIMD
<< std::setw(14) << hstMomenta[ipagM*npar*np4*neppM + ipar*np4*neppM + 0*neppM + ieppM] // AOSOA[ipagM][ipar][0][ieppM]
<< std::setw(14) << hstMomenta[ipagM*npar*np4*neppM + ipar*np4*neppM + 1*neppM + ieppM] // AOSOA[ipagM][ipar][1][ieppM]
<< std::setw(14) << hstMomenta[ipagM*npar*np4*neppM + ipar*np4*neppM + 2*neppM + ieppM] // AOSOA[ipagM][ipar][2][ieppM]
<< std::setw(14) << hstMomenta[ipagM*npar*np4*neppM + ipar*np4*neppM + 3*neppM + ieppM] // AOSOA[ipagM][ipar][3][ieppM]
#else
<< std::setw(14) << hstMomenta[ipagM*npar*np4 + ipar*np4 + 0][ieppM] // AOSOA[ipagM][ipar][0][ieppM]
<< std::setw(14) << hstMomenta[ipagM*npar*np4 + ipar*np4 + 1][ieppM] // AOSOA[ipagM][ipar][1][ieppM]
<< std::setw(14) << hstMomenta[ipagM*npar*np4 + ipar*np4 + 2][ieppM] // AOSOA[ipagM][ipar][2][ieppM]
<< std::setw(14) << hstMomenta[ipagM*npar*np4 + ipar*np4 + 3][ieppM] // AOSOA[ipagM][ipar][3][ieppM]
#endif
<< std::endl
<< std::defaultfloat; // default format: affects all floats
}
std::cout << std::string(SEP79, '-') << std::endl;
// Display matrix elements
std::cout << " Matrix element = "
#ifndef MGONGPU_CPPSIMD
<< hstMEs[ievt]
#else
<< hstMEs[ievt/neppM][ievt%neppM]
#endif
<< " GeV^" << meGeVexponent << std::endl; // FIXME: assume process.nprocesses == 1
std::cout << std::string(SEP79, '-') << std::endl;
}
// Fill the arrays with ALL MEs and weights
#ifndef MGONGPU_CPPSIMD
matrixelementALL[iiter*nevt + ievt] = hstMEs[ievt]; // FIXME: assume process.nprocesses == 1
#else
matrixelementALL[iiter*nevt + ievt] = hstMEs[ievt/neppM][ievt%neppM]; // FIXME: assume process.nprocesses == 1
#endif
weightALL[iiter*nevt + ievt] = hstWeights[ievt];
}
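
The AOSOA indexing used for the momenta printout above can be written as a single flat-index formula; a self-contained sketch (neppM is given a hypothetical value; npar=4 and np4=4 as for e+ e- -> mu+ mu-):

// Flat index into a buffer laid out as AOSOA[npagM][npar][np4][neppM], nevt = npagM*neppM.
#include <cstddef>
#include <cstdio>

constexpr int neppM = 4; // hypothetical: events per "page" of the momenta AOSOA
constexpr int npar  = 4; // particles per event (e+ e- mu+ mu-)
constexpr int np4   = 4; // momentum components (E, px, py, pz)

inline std::size_t aosoaIndex( int ievt, int ipar, int ip4 )
{
  const int ipagM = ievt / neppM; // page containing the event
  const int ieppM = ievt % neppM; // position of the event within the page
  return std::size_t(ipagM)*npar*np4*neppM + ipar*np4*neppM + ip4*neppM + ieppM;
}

int main() { printf( "%zu\n", aosoaIndex( 5, 2, 3 ) ); return 0; } // prints 109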

