Skip to content

Commit

Permalink
Use pinned (page-locked) CPU memory, allocated with cudaMallocHost, for the host buffers. I get a factor 1.5 better throughput!
Browse files Browse the repository at this point in the history
time ./check.exe -p 2048 256 12
***********************************
NumIterations         = 12
NumThreadsPerBlock    = 256
NumBlocksPerGrid      = 2048
-----------------------------------
NumberOfEntries       = 12
TotalTimeInWaveFuncs  = 2.784078e-02 sec
MeanTimeInWaveFuncs   = 2.320065e-03 sec
StdDevTimeInWaveFuncs = 1.567047e-05 sec
MinTimeInWaveFuncs    = 2.310220e-03 sec
MaxTimeInWaveFuncs    = 2.370682e-03 sec
-----------------------------------
ProcessID:            = 23402
NProcesses            = 1
NumMatrixElements     = 6291456
MatrixElementsPerSec  = 2.259799e+08 sec^-1
***********************************
NumMatrixElements     = 6291456
MeanMatrixElemValue   = 1.371745e-02 GeV^0
StdErrMatrixElemValue = 3.268633e-06 GeV^0
StdDevMatrixElemValue = 8.198638e-03 GeV^0
MinMatrixElemValue    = 6.071582e-03 GeV^0
MaxMatrixElemValue    = 3.374925e-02 GeV^0

real    0m4.633s
user    0m4.055s
sys     0m0.562s
  • Loading branch information
valassi committed Jul 30, 2020
1 parent 189ea2b commit 7d27440
Showing 1 changed file with 10 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -92,16 +92,16 @@ int main(int argc, char **argv) {
double* rnarray = new double[nparf*np4*ndim]; // can be SOA or AOS

int nbytesMomenta = np4*npar*ndim * sizeof(double);
double* hstMomenta = new double[npar*np4*ndim]; // SOA[npar][np4][ndim] (previously was: lp)
//double* hstMomenta = 0; // SOA[npar][np4][ndim] (previously was: lp)
//gpuErrchk3( cudaMallocHost( &hstMomenta, nbytesMomenta ) );
//double* hstMomenta = new double[npar*np4*ndim]; // SOA[npar][np4][ndim] (previously was: lp)
double* hstMomenta = 0; // SOA[npar][np4][ndim] (previously was: lp)
gpuErrchk3( cudaMallocHost( &hstMomenta, nbytesMomenta ) );
double* devMomenta = 0; // (previously was: allMomenta)
gpuErrchk3( cudaMalloc( &devMomenta, nbytesMomenta ) );

int nbytesMEs = ndim * sizeof(double);
double* hstMEs = new double[ndim]; // (previously was: meHostPtr)
//double* hstMEs = 0; // (previously was: meHostPtr)
//gpuErrchk3( cudaMallocHost( &hstMEs, nbytesMEs ) );
//double* hstMEs = new double[ndim]; // (previously was: meHostPtr)
double* hstMEs = 0; // (previously was: meHostPtr)
gpuErrchk3( cudaMallocHost( &hstMEs, nbytesMEs ) );
double* devMEs = 0; // (previously was: meDevPtr)
gpuErrchk3( cudaMalloc( &devMEs, nbytesMEs ) );

Expand Down Expand Up @@ -282,10 +282,10 @@ int main(int argc, char **argv) {
delete[] rmbMomenta;
#endif

delete[] hstMEs;
delete[] hstMomenta;
//gpuErrchk3( cudaFreeHost( hstMEs ) );
//gpuErrchk3( cudaFreeHost( hstMomenta ) );
//delete[] hstMEs;
//delete[] hstMomenta;
gpuErrchk3( cudaFreeHost( hstMEs ) );
gpuErrchk3( cudaFreeHost( hstMomenta ) );

gpuErrchk3( cudaFree( devMEs ) );
gpuErrchk3( cudaFree( devMomenta ) );
Expand Down

0 comments on commit 7d27440

Please sign in to comment.