From 27e789cfe1e3bfd895efd89c695c6468d4bb75b8 Mon Sep 17 00:00:00 2001
From: Giovanni Bussi <giovanni.bussi@gmail.com>
Date: Mon, 11 Mar 2024 12:47:10 +0100
Subject: [PATCH] Extensive improvements in benchmark tool

Added:

- stop after a maximum time
- benchmark domain decomposition runs
- comparative analysis on different versions
- sleep to simulate MD code calculation
---
 src/cltools/Benchmark.cpp | 293 +++++++++++++++++++++++++++++++++-----
 1 file changed, 258 insertions(+), 35 deletions(-)
diff --git a/src/cltools/Benchmark.cpp b/src/cltools/Benchmark.cpp
index 6f62fff13d..b2568aaec3 100644
--- a/src/cltools/Benchmark.cpp
+++ b/src/cltools/Benchmark.cpp
@@ -37,6 +37,7 @@
 #include <cstdio>
 #include <random>
 #include <algorithm>
+#include <chrono>
 
 namespace PLMD {
 namespace cltools {
@@ -49,20 +50,19 @@ The main difference wrt driver is that it generates a trajectory in memory rathe
 from a file. This allows to better time the overhead of the plumed library, without including
 the time needed to read the trajectory.
 
-As of now it does not test cases where atoms are scattered over processors (TODO).
-
 It is also possible to load a separate version of the plumed kernel. This enables running
-benchmarks agaist previous plumed versions
+benchmarks agaist previous plumed versions in a controlled setting, where systematic errors
+in the comparison are minimized.
 
 \par Examples
 
 First, you should create a sample `plumed.dat` file for testing. For instance:
-```
+
+\plumedfile
 WHOLEMOLECULES ENTITY0=1-10000
 p: POSITION ATOM=1
 RESTRAINT ARG=p.x KAPPA=1 AT=0
-
-```
+\endplumedfile
 
 Then you can test the performance of this input with the following command:
 \verbatim
@@ -76,9 +76,11 @@ plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so
 \endverbatim
 
 \warning It is necessary to use the `plumed-runtime` executable here to avoid conflicts between different
-plumed versions. You will find it in `src/lib` if you are using the non installed version of plumed,
+plumed versions. You will find it in your path if you are using the non installed version of plumed,
 and in `$prefix/lib/plumed` if you installed plumed in $prefix,.
 
+\par Comparing multiple versions
+
 The best way to compare two versions of plumed on the same input is to pass multiple colon-separated kernels:
 
 \verbatim
@@ -96,20 +98,49 @@ two different plumed input files that compute the same thing, you can also use m
 plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:this --plumed plumed1.dat:plumed2.dat
 \endverbatim
 
+Similarly, you might want to run two different inputs using the same kernel, which can be obtained with:
+
+\verbatim
+plumed-runtime benchmark --plumed plumed1.dat:plumed2.dat
+\endverbatim
+
 \par Profiling
 
 If you want to attach a profiler on the fly to the process, you might find it convenient to use `--nsteps -1`.
 The simulation will run forever and can be interrupted with CTRL-C. When interrupted, the result of the timers
 should be displayed anyway.
+You can also run setting a maximum time with `--maxtime`.
+
+If you run a profiler when testing multiple PLUMED versions you might be confused by which function is from
+each version. It is recommended to recompile separate instances with a separate C++ namespace (`-DPLMD=PLUMED_version_1`)
+so that you will be able to distinguish them. In addition, compiling with `CXXFLAGS="-g -O3"` will make the profiling
+report more complete, likely including code lines.
+
+\par MPI runs
+
+You can run emulating a domain decomposition. This is done automatically if plumed has been compiled with MPI
+and you run with `mpirun`
+
+\verbatim
+mpirun -np 4 plumed-runtime benchmark
+\endverbatim
+
+If you load separate PLUMED instances as discussed above, they should all be compiled against the same MPI version.
+Notice that when using MPI signals (CTRL-C) might not work.
+
+Since some of the data transfer could happen asynchronously, you might want to use the `--sleep` option
+to simulate a lag between the `prepareCalc` and `performCalc` actions. This part of the calculation will not contribute
+to timer, but will obviously slow down your test.
 
 \par Output
 
 In the output you will see the usual reports about timing produced by the internal
 timers of the tested plumed instances.
 In addition, this tool will monitor the timing externally, with some slightly different criterion:
-- First, the initialization (construction of the input) will be shown with a separate timer
-- Second, the timer corresponding to the calculation will be split in two parts, reporting
-  execution of the first half of the calculation and the second half.
+- First, the initialization (construction of the input) will be shown with a separate timer,
+  as well as the timing for the first step.
+- Second, the timer corresponding to the calculation will be split in three parts, reporting
+  execution of the first 20% (warm-up) and the next two blocks of 40% each.
 - Finally, you might notice some discrepancy because some of the actions that are usually
   not expensive are not included in the internal timers. The external timer will
   thus provide a better estimate of the total elapsed time, including everything.
@@ -117,6 +148,11 @@ In addition, this tool will monitor the timing externally, with some slightly di
 The internal timers are still useful to monitor what happens at the different stages
 and, with \ref DEBUG `DETAILED_TIMERS`, what happens in each action.
 
+When you run multiple version, a comparative analisys of the time spent within PLUMED in the various
+instances will be done, showing the ratio between the total time and the time measured on the first
+instance, which will act as a reference. Errors will be estimated with bootstrapping. The warm-up phase will be discarded for
+this analysis.
+
 */
 //+ENDPLUMEDOC
 
@@ -163,6 +199,9 @@ struct Kernel {
   std::string plumed_dat;
   PlumedHandle handle;
   Stopwatch stopwatch;
+  std::vector<long long int> timings;
+  double comparative_timing=-1.0;
+  double comparative_timing_error=-1.0;
   Log* log=nullptr;
   Kernel(const std::string & path_,const std::string & plumed_dat, Log* log_):
     path(path_),
@@ -175,8 +214,11 @@ struct Kernel {
   ~Kernel() {
     if(log) {
       (*log)<<"\n";
-      (*log)<<"Kernel: "<<path<<"\n";
-      (*log)<<"Input:  "<<plumed_dat<<"\n";
+      (*log)        <<"Kernel:      "<<path<<"\n";
+      (*log)        <<"Input:       "<<plumed_dat<<"\n";
+      if(comparative_timing>0.0) {
+        (*log).printf("Comparative: %.3f +- %.3f\n",comparative_timing,comparative_timing_error);
+      }
     }
   }
   Kernel(Kernel && other) noexcept:
@@ -222,6 +264,9 @@ void Benchmark::registerKeywords( Keywords& keys ) {
   keys.add("compulsory","--kernel","this","colon separated path(s) to kernel(s)");
   keys.add("compulsory","--natoms","100000","the number of atoms to use for the simulation");
   keys.add("compulsory","--nsteps","2000","number of steps of MD to perform (-1 means forever)");
+  keys.add("compulsory","--maxtime","-1","maximum number of seconds (-1 means forever)");
+  keys.add("compulsory","--sleep","0","number of seconds of sleep, mimicking MD calculation");
+  keys.addFlag("--domain-decomposition",false,"simulate domain decomposition, implies --shuffle");
   keys.addFlag("--shuffled",false,"reshuffle atoms");
 }
 
@@ -234,19 +279,105 @@ Benchmark::Benchmark(const CLToolOptions& co ):
 
 int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
 
+  std::mt19937 g; // deterministic initialization to avoid issues with MPI
+
+  struct FileDeleter {
+    void operator()(FILE*f) const noexcept {
+      if(f) std::fclose(f);
+    }
+  };
+
+  std::unique_ptr<FILE,FileDeleter> log_dev_null{std::fopen("/dev/null","w")};
+
   Log log;
-  log.link(out);
+  if(pc.Get_rank()==0) {
+    log.link(out);
+  } else {
+    log.link(log_dev_null.get());
+  }
   log.setLinePrefix("BENCH:  ");
 
   std::vector<Kernel> kernels;
 
+  // perform comparative analysis
   // ensure that kernels vector is destroyed from last to first element upon exit
-  auto kernels_deleter=[](auto f) { while(!f->empty()) f->pop_back();};
-  std::unique_ptr<decltype(kernels),decltype(kernels_deleter)> kernels_deleter_obj(&kernels,kernels_deleter);
+  auto kernels_deleter=[&log](auto f) {
+    if(!f) return;
+    if(f->empty()) return;
+    std::mt19937 g;
+
+    auto size=f->back().timings.size();
+    constexpr int B=200;
+    auto numblocks=size;
+    // for some reasons, blocked bootstrap underestimates error
+    // For now I keep it off. If I remove it, the code below could be simplified
+    // if(numblocks>20) numblocks=20;
+    auto blocksize=size/numblocks;
+
+    if(f->size()<2) {
+      log<<"Single run, skipping comparative analysis\n";
+    } else if(size<10) {
+      log<<"Too small sample, skipping comparative analysis\n";
+    } else try {
+
+        log<<"Running comparative analysis, "<<numblocks<<" blocks with size "<<blocksize<<"\n";
+
+        std::vector<std::size_t> choice(size);
+        std::uniform_int_distribution<> distrib(0, numblocks-1);
+        std::vector<std::vector<long long int>> blocks(f->size());
+
+        int i=0;
+        for(auto it = f->rbegin(); it != f->rend(); ++it) {
+          int l=0;
+          blocks[i].assign(numblocks,0);
+          for(auto j=0; j<numblocks; j++) {
+            for(auto k=0; k<blocksize; k++) {
+              plumed_assert(l<it->timings.size());
+              blocks[i][j]+=it->timings[l];
+              l++;
+            }
+          }
+          i++;
+        }
+
+        std::vector<std::vector<double>> ratios(f->size());
+        for(auto & r : ratios) r.resize(B);
+
+        for(unsigned b=0; b<B; b++) {
+          for(auto & c : choice) c=distrib(g);
+          long long int reference=0;
+          for(auto & c : choice) reference+=blocks[0][c];
+          for(auto i=0; i<blocks.size(); i++) {
+            long long int estimate=0;
+            // this would lead to separate bootstrap samples for each estimate:
+            // for(auto & c : choice) c=distrib(g);
+            for(auto & c : choice) estimate+=blocks[i][c];
+            ratios[i][b]=double(estimate)/double(reference);
+          }
+        }
+
+        i=0;
+        for(auto it = f->rbegin(); it != f->rend(); ++it) {
+          double sum=0.0;
+          double sum2=0.0;
+          for(auto r : ratios[i]) {
+            sum+=r;
+            sum2+=r*r;
+          }
+          it->comparative_timing=sum/B;
+          it->comparative_timing_error=std::sqrt(sum2/B-sum*sum/(B*B));
+          i++;
+        }
+
+      } catch(std::exception & e) {
+        log<<"Unexpected error during comparative analysis\n";
+        log<<e.what()<<"\n";
+      }
+    while(!f->empty()) f->pop_back();
 
+  };
+  std::unique_ptr<decltype(kernels),decltype(kernels_deleter)> kernels_deleter_obj(&kernels,kernels_deleter);
 
-  std::random_device rd;
-  std::mt19937 g(rd());
 
   // construct the kernels vector:
   {
@@ -297,15 +428,27 @@ int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
   int nf; parse("--nsteps",nf);
   unsigned natoms; parse("--natoms",natoms);
 
+  double maxtime; parse("--maxtime",maxtime);
+
+  bool domain_decomposition=false;
+  parseFlag("--domain-decomposition",domain_decomposition);
+  if(pc.Get_size()>1) domain_decomposition=true;
+  if(domain_decomposition) shuffled=true;
+
+  double timeToSleep;
+  parse("--sleep",timeToSleep);
+
   std::vector<int> shuffled_indexes;
 
   // trap signals:
   SignalHandlerGuard sigIntGuard(SIGINT, signalHandler);
 
+  auto initial_time=std::chrono::high_resolution_clock::now();
+
   for(auto & k : kernels) {
     auto & p(k.handle);
     auto sw=k.stopwatch.startStop("A Initialization");
-    if(Communicator::plumedHasMPI()) p.cmd("setMPIComm",&pc.Get_comm());
+    if(Communicator::plumedHasMPI() && domain_decomposition) p.cmd("setMPIComm",&pc.Get_comm());
     p.cmd("setRealPrecision",(int)sizeof(double));
     p.cmd("setMDLengthUnits",1.0);
     p.cmd("setMDChargeUnits",1.0);
@@ -334,32 +477,112 @@ int Benchmark::main(FILE* in, FILE*out,Communicator& pc) {
 
   int plumedStopCondition=0;
   bool fast_finish=false;
+  int part=0;
+
+  log<<"Starting MD loop\n";
+  log<<"Use CTRL+C to stop at any time and collect timers (not working in MPI runs)\n";
+
   for(int step=0; nf<0 || step<nf; ++step) {
     std::shuffle(kernels_ptr.begin(),kernels_ptr.end(),g);
     for(unsigned j=0; j<natoms; ++j) pos[j] = Vector(step*j, step*j+1, step*j+2);
+    double* pos_ptr;
+    double* for_ptr;
+    double* charges_ptr;
+    double* masses_ptr;
+    int* indexes_ptr=nullptr;
+    int n_local_atoms;
+
+    if(domain_decomposition) {
+      auto nproc=pc.Get_size();
+      auto nn=natoms/nproc;
+      auto excess=natoms%nproc;
+      auto myrank=pc.Get_rank();
+      auto shift=0;
+      n_local_atoms=nn;
+      if(myrank<excess) n_local_atoms+=1;
+      for(unsigned i=0; i<myrank; i++) {
+        shift+=nn;
+        if(i<excess) shift+=1;
+      }
+      pos_ptr=&pos[shift][0];
+      for_ptr=&forces[shift][0];
+      charges_ptr=&charges[shift];
+      masses_ptr=&masses[shift];
+      indexes_ptr=shuffled_indexes.data()+shift;
+    } else {
+      pos_ptr=&pos[0][0];
+      for_ptr=&forces[0][0];
+      charges_ptr=&charges[0];
+      masses_ptr=&masses[0];
+      n_local_atoms=natoms;
+      indexes_ptr=shuffled_indexes.data();
+    }
+
+    const char* sw_name;
+    if(part==0)      sw_name="B0 First step";
+    else if(part==1) sw_name="B1 Warm-up";
+    else if(part==2) sw_name="B2 Calculation part 1";
+    else             sw_name="B3 Calculation part 2";
+
+
     for(unsigned i=0; i<kernels_ptr.size(); i++) {
       auto & p(kernels_ptr[i]->handle);
 
-      const char* sw_name;
-      if(nf<0) sw_name="B Calculation";
-      else if(step<nf/2) sw_name="B1 Calculation part 1";
-      else sw_name="B2 Calculation part 2";
-      auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name);
-      p.cmd("setStep",step);
-      p.cmd("setStopFlag",&plumedStopCondition);
-      p.cmd("setForces",&forces[0][0],3*natoms);
-      p.cmd("setBox",&cell[0],9);
-      p.cmd("setVirial",&virial[0],9);
-      p.cmd("setPositions",&pos[0][0],3*natoms);
-      p.cmd("setMasses",&masses[0],natoms);
-      p.cmd("setCharges",&charges[0],natoms);
-      if(shuffled) {
-        p.cmd("setAtomsNlocal",natoms);
-        p.cmd("setAtomsGatindex",&shuffled_indexes[0],shuffled_indexes.size());
+      {
+        auto sw=kernels_ptr[i]->stopwatch.startPause(sw_name);
+        p.cmd("setStep",step);
+        p.cmd("setStopFlag",&plumedStopCondition);
+        p.cmd("setForces",for_ptr,n_local_atoms*3);
+        p.cmd("setBox",&cell[0],9);
+        p.cmd("setVirial",&virial[0],9);
+        p.cmd("setPositions",pos_ptr,n_local_atoms*3);
+        p.cmd("setMasses",masses_ptr,n_local_atoms);
+        p.cmd("setCharges",charges_ptr,n_local_atoms);
+        if(shuffled) {
+          p.cmd("setAtomsNlocal",n_local_atoms);
+          p.cmd("setAtomsGatindex",indexes_ptr,n_local_atoms);
+        }
+        p.cmd("prepareCalc");
       }
-      p.cmd("calc");
+
+      // mimick MD calculation here
+      {
+        unsigned k=0;
+        auto start=std::chrono::high_resolution_clock::now();
+        while(std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-start).count()<(long long int)1e9*timeToSleep) k+=i*i;
+        std::fprintf(log_dev_null.get(),"%u",k);
+      }
+
+      {
+        auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name);
+        p.cmd("performCalc");
+      }
+
+      if(kernels_ptr.size()>1 && part>1) kernels_ptr[i]->timings.push_back(kernels_ptr[i]->stopwatch.getLastCycle(sw_name));
       if(plumedStopCondition || signalReceived.load()) fast_finish=true;
     }
+    auto elapsed=std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now()-initial_time).count();
+    if(part==0) part=1;
+    if(part<2) {
+      if((maxtime>0 && elapsed>(long long int)(0.2*1e9*maxtime)) || (nf>0 && step+1>=nf/5) || (maxtime<0 & nf<0 && step+1>=100)) {
+        part=2;
+        log<<"Warm-up completed\n";
+      }
+    }
+    if(part<3) {
+      if((maxtime>0 && elapsed>(long long int)(0.6*1e9*maxtime)) || (nf>0 && step+1>=3*nf/5)) {
+        part=3;
+        log<<"60% completed\n";
+      }
+    }
+
+    if(maxtime>0 && elapsed>(long long int)(1e9*maxtime)) fast_finish=true;
+
+    {
+      unsigned tmp=fast_finish;
+      pc.Bcast(tmp,0);
+      fast_finish=tmp;
+    }
     if(fast_finish) break;
   }