From 27e789cfe1e3bfd895efd89c695c6468d4bb75b8 Mon Sep 17 00:00:00 2001 From: Giovanni Bussi Date: Mon, 11 Mar 2024 12:47:10 +0100 Subject: [PATCH] Extensive improvements in benchmark tool Added: - stop after a maximum time - benchmark domain decomposition runs - comparative analysis on different versions - sleep to simulate MD code calculation --- src/cltools/Benchmark.cpp | 293 +++++++++++++++++++++++++++++++++----- 1 file changed, 258 insertions(+), 35 deletions(-) diff --git a/src/cltools/Benchmark.cpp b/src/cltools/Benchmark.cpp index 6f62fff13d..b2568aaec3 100644 --- a/src/cltools/Benchmark.cpp +++ b/src/cltools/Benchmark.cpp @@ -37,6 +37,7 @@ #include #include #include +#include namespace PLMD { namespace cltools { @@ -49,20 +50,19 @@ The main difference wrt driver is that it generates a trajectory in memory rathe from a file. This allows to better time the overhead of the plumed library, without including the time needed to read the trajectory. -As of now it does not test cases where atoms are scattered over processors (TODO). - It is also possible to load a separate version of the plumed kernel. This enables running -benchmarks agaist previous plumed versions +benchmarks agaist previous plumed versions in a controlled setting, where systematic errors +in the comparison are minimized. \par Examples First, you should create a sample `plumed.dat` file for testing. For instance: -``` + +\plumedfile WHOLEMOLECULES ENTITY0=1-10000 p: POSITION ATOM=1 RESTRAINT ARG=p.x KAPPA=1 AT=0 - -``` +\endplumedfile Then you can test the performance of this input with the following command: \verbatim @@ -76,9 +76,11 @@ plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so \endverbatim \warning It is necessary to use the `plumed-runtime` executable here to avoid conflicts between different -plumed versions. You will find it in `src/lib` if you are using the non installed version of plumed, +plumed versions. You will find it in your path if you are using the non installed version of plumed, and in `$prefix/lib/plumed` if you installed plumed in $prefix,. +\par Comparing multiple versions + The best way to compare two versions of plumed on the same input is to pass multiple colon-separated kernels: \verbatim @@ -96,20 +98,49 @@ two different plumed input files that compute the same thing, you can also use m plumed-runtime benchmark --kernel /path/to/lib/libplumedKernel.so:this --plumed plumed1.dat:plumed2.dat \endverbatim +Similarly, you might want to run two different inputs using the same kernel, which can be obtained with: + +\verbatim +plumed-runtime benchmark --plumed plumed1.dat:plumed2.dat +\endverbatim + \par Profiling If you want to attach a profiler on the fly to the process, you might find it convenient to use `--nsteps -1`. The simulation will run forever and can be interrupted with CTRL-C. When interrupted, the result of the timers should be displayed anyway. +You can also run setting a maximum time with `--maxtime`. + +If you run a profiler when testing multiple PLUMED versions you might be confused by which function is from +each version. It is recommended to recompile separate instances with a separate C++ namespace (`-DPLMD=PLUMED_version_1`) +so that you will be able to distinguish them. In addition, compiling with `CXXFLAGS="-g -O3"` will make the profiling +report more complete, likely including code lines. + +\par MPI runs + +You can run emulating a domain decomposition. This is done automatically if plumed has been compiled with MPI +and you run with `mpirun` + +\verbatim +mpirun -np 4 plumed-runtime benchmark +\endverbatim + +If you load separate PLUMED instances as discussed above, they should all be compiled against the same MPI version. +Notice that when using MPI signals (CTRL-C) might not work. + +Since some of the data transfer could happen asynchronously, you might want to use the `--sleep` option +to simulate a lag between the `prepareCalc` and `performCalc` actions. This part of the calculation will not contribute +to timer, but will obviously slow down your test. \par Output In the output you will see the usual reports about timing produced by the internal timers of the tested plumed instances. In addition, this tool will monitor the timing externally, with some slightly different criterion: -- First, the initialization (construction of the input) will be shown with a separate timer -- Second, the timer corresponding to the calculation will be split in two parts, reporting - execution of the first half of the calculation and the second half. +- First, the initialization (construction of the input) will be shown with a separate timer, + as well as the timing for the first step. +- Second, the timer corresponding to the calculation will be split in three parts, reporting + execution of the first 20% (warm-up) and the next two blocks of 40% each. - Finally, you might notice some discrepancy because some of the actions that are usually not expensive are not included in the internal timers. The external timer will thus provide a better estimate of the total elapsed time, including everything. @@ -117,6 +148,11 @@ In addition, this tool will monitor the timing externally, with some slightly di The internal timers are still useful to monitor what happens at the different stages and, with \ref DEBUG `DETAILED_TIMERS`, what happens in each action. +When you run multiple version, a comparative analisys of the time spent within PLUMED in the various +instances will be done, showing the ratio between the total time and the time measured on the first +instance, which will act as a reference. Errors will be estimated with bootstrapping. The warm-up phase will be discarded for +this analysis. + */ //+ENDPLUMEDOC @@ -163,6 +199,9 @@ struct Kernel { std::string plumed_dat; PlumedHandle handle; Stopwatch stopwatch; + std::vector timings; + double comparative_timing=-1.0; + double comparative_timing_error=-1.0; Log* log=nullptr; Kernel(const std::string & path_,const std::string & plumed_dat, Log* log_): path(path_), @@ -175,8 +214,11 @@ struct Kernel { ~Kernel() { if(log) { (*log)<<"\n"; - (*log)<<"Kernel: "<0.0) { + (*log).printf("Comparative: %.3f +- %.3f\n",comparative_timing,comparative_timing_error); + } } } Kernel(Kernel && other) noexcept: @@ -222,6 +264,9 @@ void Benchmark::registerKeywords( Keywords& keys ) { keys.add("compulsory","--kernel","this","colon separated path(s) to kernel(s)"); keys.add("compulsory","--natoms","100000","the number of atoms to use for the simulation"); keys.add("compulsory","--nsteps","2000","number of steps of MD to perform (-1 means forever)"); + keys.add("compulsory","--maxtime","-1","maximum number of seconds (-1 means forever)"); + keys.add("compulsory","--sleep","0","number of seconds of sleep, mimicking MD calculation"); + keys.addFlag("--domain-decomposition",false,"simulate domain decomposition, implies --shuffle"); keys.addFlag("--shuffled",false,"reshuffle atoms"); } @@ -234,19 +279,105 @@ Benchmark::Benchmark(const CLToolOptions& co ): int Benchmark::main(FILE* in, FILE*out,Communicator& pc) { + std::mt19937 g; // deterministic initialization to avoid issues with MPI + + struct FileDeleter { + void operator()(FILE*f) const noexcept { + if(f) std::fclose(f); + } + }; + + std::unique_ptr log_dev_null{std::fopen("/dev/null","w")}; + Log log; - log.link(out); + if(pc.Get_rank()==0) { + log.link(out); + } else { + log.link(log_dev_null.get()); + } log.setLinePrefix("BENCH: "); std::vector kernels; + // perform comparative analysis // ensure that kernels vector is destroyed from last to first element upon exit - auto kernels_deleter=[](auto f) { while(!f->empty()) f->pop_back();}; - std::unique_ptr kernels_deleter_obj(&kernels,kernels_deleter); + auto kernels_deleter=[&log](auto f) { + if(!f) return; + if(f->empty()) return; + std::mt19937 g; + + auto size=f->back().timings.size(); + constexpr int B=200; + auto numblocks=size; + // for some reasons, blocked bootstrap underestimates error + // For now I keep it off. If I remove it, the code below could be simplified + // if(numblocks>20) numblocks=20; + auto blocksize=size/numblocks; + + if(f->size()<2) { + log<<"Single run, skipping comparative analysis\n"; + } else if(size<10) { + log<<"Too small sample, skipping comparative analysis\n"; + } else try { + + log<<"Running comparative analysis, "< choice(size); + std::uniform_int_distribution<> distrib(0, numblocks-1); + std::vector> blocks(f->size()); + + int i=0; + for(auto it = f->rbegin(); it != f->rend(); ++it) { + int l=0; + blocks[i].assign(numblocks,0); + for(auto j=0; jtimings.size()); + blocks[i][j]+=it->timings[l]; + l++; + } + } + i++; + } + + std::vector> ratios(f->size()); + for(auto & r : ratios) r.resize(B); + + for(unsigned b=0; brbegin(); it != f->rend(); ++it) { + double sum=0.0; + double sum2=0.0; + for(auto r : ratios[i]) { + sum+=r; + sum2+=r*r; + } + it->comparative_timing=sum/B; + it->comparative_timing_error=std::sqrt(sum2/B-sum*sum/(B*B)); + i++; + } + + } catch(std::exception & e) { + log<<"Unexpected error during comparative analysis\n"; + log<empty()) f->pop_back(); + }; + std::unique_ptr kernels_deleter_obj(&kernels,kernels_deleter); - std::random_device rd; - std::mt19937 g(rd()); // construct the kernels vector: { @@ -297,15 +428,27 @@ int Benchmark::main(FILE* in, FILE*out,Communicator& pc) { int nf; parse("--nsteps",nf); unsigned natoms; parse("--natoms",natoms); + double maxtime; parse("--maxtime",maxtime); + + bool domain_decomposition=false; + parseFlag("--domain-decomposition",domain_decomposition); + if(pc.Get_size()>1) domain_decomposition=true; + if(domain_decomposition) shuffled=true; + + double timeToSleep; + parse("--sleep",timeToSleep); + std::vector shuffled_indexes; // trap signals: SignalHandlerGuard sigIntGuard(SIGINT, signalHandler); + auto initial_time=std::chrono::high_resolution_clock::now(); + for(auto & k : kernels) { auto & p(k.handle); auto sw=k.stopwatch.startStop("A Initialization"); - if(Communicator::plumedHasMPI()) p.cmd("setMPIComm",&pc.Get_comm()); + if(Communicator::plumedHasMPI() && domain_decomposition) p.cmd("setMPIComm",&pc.Get_comm()); p.cmd("setRealPrecision",(int)sizeof(double)); p.cmd("setMDLengthUnits",1.0); p.cmd("setMDChargeUnits",1.0); @@ -334,32 +477,112 @@ int Benchmark::main(FILE* in, FILE*out,Communicator& pc) { int plumedStopCondition=0; bool fast_finish=false; + int part=0; + + log<<"Starting MD loop\n"; + log<<"Use CTRL+C to stop at any time and collect timers (not working in MPI runs)\n"; + for(int step=0; nf<0 || stephandle); - const char* sw_name; - if(nf<0) sw_name="B Calculation"; - else if(stepstopwatch.startStop(sw_name); - p.cmd("setStep",step); - p.cmd("setStopFlag",&plumedStopCondition); - p.cmd("setForces",&forces[0][0],3*natoms); - p.cmd("setBox",&cell[0],9); - p.cmd("setVirial",&virial[0],9); - p.cmd("setPositions",&pos[0][0],3*natoms); - p.cmd("setMasses",&masses[0],natoms); - p.cmd("setCharges",&charges[0],natoms); - if(shuffled) { - p.cmd("setAtomsNlocal",natoms); - p.cmd("setAtomsGatindex",&shuffled_indexes[0],shuffled_indexes.size()); + { + auto sw=kernels_ptr[i]->stopwatch.startPause(sw_name); + p.cmd("setStep",step); + p.cmd("setStopFlag",&plumedStopCondition); + p.cmd("setForces",for_ptr,n_local_atoms*3); + p.cmd("setBox",&cell[0],9); + p.cmd("setVirial",&virial[0],9); + p.cmd("setPositions",pos_ptr,n_local_atoms*3); + p.cmd("setMasses",masses_ptr,n_local_atoms); + p.cmd("setCharges",charges_ptr,n_local_atoms); + if(shuffled) { + p.cmd("setAtomsNlocal",n_local_atoms); + p.cmd("setAtomsGatindex",indexes_ptr,n_local_atoms); + } + p.cmd("prepareCalc"); } - p.cmd("calc"); + + // mimick MD calculation here + { + unsigned k=0; + auto start=std::chrono::high_resolution_clock::now(); + while(std::chrono::duration_cast(std::chrono::high_resolution_clock::now()-start).count()<(long long int)1e9*timeToSleep) k+=i*i; + std::fprintf(log_dev_null.get(),"%u",k); + } + + { + auto sw=kernels_ptr[i]->stopwatch.startStop(sw_name); + p.cmd("performCalc"); + } + + if(kernels_ptr.size()>1 && part>1) kernels_ptr[i]->timings.push_back(kernels_ptr[i]->stopwatch.getLastCycle(sw_name)); if(plumedStopCondition || signalReceived.load()) fast_finish=true; } + auto elapsed=std::chrono::duration_cast(std::chrono::high_resolution_clock::now()-initial_time).count(); + if(part==0) part=1; + if(part<2) { + if((maxtime>0 && elapsed>(long long int)(0.2*1e9*maxtime)) || (nf>0 && step+1>=nf/5) || (maxtime<0 & nf<0 && step+1>=100)) { + part=2; + log<<"Warm-up completed\n"; + } + } + if(part<3) { + if((maxtime>0 && elapsed>(long long int)(0.6*1e9*maxtime)) || (nf>0 && step+1>=3*nf/5)) { + part=3; + log<<"60% completed\n"; + } + } + + if(maxtime>0 && elapsed>(long long int)(1e9*maxtime)) fast_finish=true; + + { + unsigned tmp=fast_finish; + pc.Bcast(tmp,0); + fast_finish=tmp; + } if(fast_finish) break; }