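Use pinned (page-locked) CPU memory for the host buffers hstMomenta and hstMEs (cudaMallocHost/cudaFreeHost instead of new[]/delete[]). I get a factor of 1.5 better throughput!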

time ./check.exe -p 2048 256 12
***********************************
NumIterations = 12
NumThreadsPerBlock = 256
NumBlocksPerGrid = 2048
-----------------------------------
NumberOfEntries = 12
TotalTimeInWaveFuncs = 2.784078e-02 sec
MeanTimeInWaveFuncs = 2.320065e-03 sec
StdDevTimeInWaveFuncs = 1.567047e-05 sec
MinTimeInWaveFuncs = 2.310220e-03 sec
MaxTimeInWaveFuncs = 2.370682e-03 sec
-----------------------------------
ProcessID: = 23402
NProcesses = 1
NumMatrixElements = 6291456
MatrixElementsPerSec = 2.259799e+08 sec^-1
***********************************
NumMatrixElements = 6291456
MeanMatrixElemValue = 1.371745e-02 GeV^0
StdErrMatrixElemValue = 3.268633e-06 GeV^0
StdDevMatrixElemValue = 8.198638e-03 GeV^0
MinMatrixElemValue = 6.071582e-03 GeV^0
MaxMatrixElemValue = 3.374925e-02 GeV^0
real 0m4.633s
user 0m4.055s
sys 0m0.562s
roiser · Jul 30, 2020 · 7d27440 · 7d27440
1 parent 189ea2b
commit 7d27440
Showing 1 changed file with 10 additions and 10 deletions.
diff --git a/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cu b/examples/gpu/eemumu_AV/SubProcesses/P1_Sigma_sm_epem_mupmum/check_sa.cu
@@ -92,16 +92,16 @@ int main(int argc, char **argv) {
   double* rnarray = new double[nparf*np4*ndim]; // can be SOA or AOS
 
   int nbytesMomenta = np4*npar*ndim * sizeof(double);
-  double* hstMomenta = new double[npar*np4*ndim]; // SOA[npar][np4][ndim] (previously was: lp)
-  //double* hstMomenta = 0; // SOA[npar][np4][ndim] (previously was: lp)
-  //gpuErrchk3( cudaMallocHost( &hstMomenta, nbytesMomenta ) );
+  //double* hstMomenta = new double[npar*np4*ndim]; // SOA[npar][np4][ndim] (previously was: lp)
+  double* hstMomenta = 0; // SOA[npar][np4][ndim] (previously was: lp)
+  gpuErrchk3( cudaMallocHost( &hstMomenta, nbytesMomenta ) );
   double* devMomenta = 0; // (previously was: allMomenta)
   gpuErrchk3( cudaMalloc( &devMomenta, nbytesMomenta ) );
 
   int nbytesMEs = ndim * sizeof(double);
-  double* hstMEs = new double[ndim]; // (previously was: meHostPtr)
-  //double* hstMEs = 0; // (previously was: meHostPtr)
-  //gpuErrchk3( cudaMallocHost( &hstMEs, nbytesMEs ) );
+  //double* hstMEs = new double[ndim]; // (previously was: meHostPtr)
+  double* hstMEs = 0; // (previously was: meHostPtr)
+  gpuErrchk3( cudaMallocHost( &hstMEs, nbytesMEs ) );
   double* devMEs = 0; // (previously was: meDevPtr)
   gpuErrchk3( cudaMalloc( &devMEs, nbytesMEs ) );
 
@@ -282,10 +282,10 @@ int main(int argc, char **argv) {
   delete[] rmbMomenta;
 #endif
 
-  delete[] hstMEs;
-  delete[] hstMomenta;
-  //gpuErrchk3( cudaFreeHost( hstMEs ) );
-  //gpuErrchk3( cudaFreeHost( hstMomenta ) );
+  //delete[] hstMEs;
+  //delete[] hstMomenta;
+  gpuErrchk3( cudaFreeHost( hstMEs ) );
+  gpuErrchk3( cudaFreeHost( hstMomenta ) );
 
   gpuErrchk3( cudaFree( devMEs ) );
   gpuErrchk3( cudaFree( devMomenta ) );