From 147c6a2555b5620a68d1744c0cbeb4ece4971380 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Sun, 21 Apr 2024 19:53:28 +0200 Subject: [PATCH 1/4] GPU: Fix standalone compilation without O2 headers --- GPU/GPUTracking/Base/cuda/CMakeLists.txt | 2 +- GPU/GPUTracking/Base/hip/CMakeLists.txt | 2 +- GPU/GPUTracking/cmake/kernel_helpers.cmake | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/CMakeLists.txt b/GPU/GPUTracking/Base/cuda/CMakeLists.txt index 7f2a4d315867c..6bade1363b7ce 100644 --- a/GPU/GPUTracking/Base/cuda/CMakeLists.txt +++ b/GPU/GPUTracking/Base/cuda/CMakeLists.txt @@ -197,7 +197,7 @@ if(NOT GPUCA_CUDA_COMPILE_MODE STREQUAL "rdc") set_target_properties(${targetName} PROPERTIES LINKER_LANGUAGE CXX) endif() -if(NOT ALIGPU_BUILD_TYPE STREQUAL "ALIROOT") +if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) add_library(GPUTrackingCUDAExternalProvider OBJECT GPUReconstructionCUDAExternalProvider.cu) add_library(O2::GPUTrackingCUDAExternalProvider ALIAS GPUTrackingCUDAExternalProvider) set_property(TARGET GPUTrackingCUDAExternalProvider PROPERTY CUDA_SEPARABLE_COMPILATION ON) diff --git a/GPU/GPUTracking/Base/hip/CMakeLists.txt b/GPU/GPUTracking/Base/hip/CMakeLists.txt index c4e818c31dc0c..151eb5ae3930a 100644 --- a/GPU/GPUTracking/Base/hip/CMakeLists.txt +++ b/GPU/GPUTracking/Base/hip/CMakeLists.txt @@ -260,7 +260,7 @@ if(NOT GPUCA_HIP_COMPILE_MODE STREQUAL "rdc") target_link_options(${targetName} PRIVATE $<$:-fno-gpu-rdc>) endif() -if(NOT ALIGPU_BUILD_TYPE STREQUAL "ALIROOT") +if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS) add_library(GPUTrackingHIPExternalProvider OBJECT ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPExternalProvider.hip) add_library(O2::GPUTrackingHIPExternalProvider ALIAS GPUTrackingHIPExternalProvider) target_compile_options(GPUTrackingHIPExternalProvider PRIVATE $<$:-fgpu-rdc>) diff --git a/GPU/GPUTracking/cmake/kernel_helpers.cmake b/GPU/GPUTracking/cmake/kernel_helpers.cmake index 2da1f13de517e..30fe2850ff3eb 100644 --- a/GPU/GPUTracking/cmake/kernel_helpers.cmake +++ b/GPU/GPUTracking/cmake/kernel_helpers.cmake @@ -83,6 +83,7 @@ function(o2_gpu_add_kernel kernel_name kernel_files kernel_bounds kernel_type) endif() list(APPEND O2_GPU_KERNEL_TEMPLATE_FILES "${TMP_KERNEL_CLASS_FILE}.cxx") list(REMOVE_DUPLICATES O2_GPU_KERNEL_TEMPLATE_FILES) + list(FILTER O2_GPU_KERNEL_TEMPLATE_FILES EXCLUDE REGEX "^-$") list(TRANSFORM O2_GPU_KERNEL_TEMPLATE_FILES APPEND "\"") list(TRANSFORM O2_GPU_KERNEL_TEMPLATE_FILES PREPEND "#include \"") list(JOIN O2_GPU_KERNEL_TEMPLATE_FILES "\n" O2_GPU_KERNEL_TEMPLATE_FILES) From d8e364004916f062e5b527b5fa0fad56f5ee2517 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Sun, 21 Apr 2024 19:53:59 +0200 Subject: [PATCH 2/4] GPU: Add cache folder and file lock options to RTC --- .../Base/cuda/GPUReconstructionCUDAGenRTC.cxx | 31 +++++++++++++++++-- GPU/GPUTracking/Definitions/GPUSettingsList.h | 4 +++ .../Standalone/Benchmark/standalone.cxx | 4 +-- GPU/Workflow/src/GPUWorkflowSpec.cxx | 6 ++-- 4 files changed, 38 insertions(+), 7 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index d8383e870f7ac..5a4ee1ea01e25 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -23,6 +23,9 @@ #ifdef GPUCA_HAVE_O2HEADERS #include "Framework/SHA1.h" #endif +#include +#include +#include using namespace GPUCA_NAMESPACE::gpu; @@ -62,11 +65,27 @@ int GPUReconstructionCUDA::genRTC(std::string& filename, unsigned int& nCompile) nCompile = mProcessingSettings.rtc.compilePerKernel ? kernels.size() : 1; bool cacheLoaded = false; + int fd = 0; if (mProcessingSettings.rtc.cacheOutput) { + if (mProcessingSettings.RTCcacheFolder != ".") { + std::filesystem::create_directories(mProcessingSettings.RTCcacheFolder); + } #ifndef GPUCA_HAVE_O2HEADERS throw std::runtime_error("Cannot use RTC cache without O2 headers"); #else - FILE* fp = fopen("rtc.cuda.cache", "rb"); + if (mProcessingSettings.rtc.cacheMutex) { + mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; + fd = open((mProcessingSettings.RTCcacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask); + if (fd == -1) { + throw std::runtime_error("Error opening rtc cache mutex lock file"); + } + fchmod(fd, mask); + if (lockf(fd, F_LOCK, 0)) { + throw std::runtime_error("Error locking rtc cache mutex file"); + } + } + + FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "rb"); char sharead[20]; if (fp) { size_t len; @@ -100,6 +119,7 @@ int GPUReconstructionCUDA::genRTC(std::string& filename, unsigned int& nCompile) break; } GPUSettingsProcessingRTC cachedSettings; + static_assert(std::is_trivially_copyable_v == true, "GPUSettingsProcessingRTC must be POD"); if (fread(&cachedSettings, sizeof(cachedSettings), 1, fp) != 1) { throw std::runtime_error("Cache file corrupt"); } @@ -183,7 +203,7 @@ int GPUReconstructionCUDA::genRTC(std::string& filename, unsigned int& nCompile) } #ifdef GPUCA_HAVE_O2HEADERS if (mProcessingSettings.rtc.cacheOutput) { - FILE* fp = fopen("rtc.cuda.cache", "w+b"); + FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "w+b"); if (fp == nullptr) { throw std::runtime_error("Cannot open cache file for writing"); } @@ -218,9 +238,16 @@ int GPUReconstructionCUDA::genRTC(std::string& filename, unsigned int& nCompile) } } fclose(fp); + } #endif } + if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtc.cacheMutex) { + if (lockf(fd, F_ULOCK, 0)) { + throw std::runtime_error("Error unlocking RTC cache mutex file"); + } + close(fd); + } #endif return 0; diff --git a/GPU/GPUTracking/Definitions/GPUSettingsList.h b/GPU/GPUTracking/Definitions/GPUSettingsList.h index 982aaaa5ed69d..3175b088d8440 100644 --- a/GPU/GPUTracking/Definitions/GPUSettingsList.h +++ b/GPU/GPUTracking/Definitions/GPUSettingsList.h @@ -191,6 +191,7 @@ AddSubConfig(GPUSettingsRecTRD, trd) AddHelp("help", 'h') EndConfig() +#ifndef __OPENCL__ // Settings steering the processing once the device was selected, only available on the host BeginSubConfig(GPUSettingsProcessingRTC, rtc, configStandalone.proc, "RTC", 0, "Processing settings", proc_rtc) AddOption(cacheOutput, bool, false, "", 0, "Cache RTC compilation results") @@ -198,6 +199,7 @@ AddOption(optConstexpr, bool, true, "", 0, "Replace constant variables by static AddOption(compilePerKernel, bool, true, "", 0, "Run one RTC compilation per kernel") AddOption(enable, bool, false, "", 0, "Use RTC to optimize GPU code") AddOption(runTest, int, 0, "", 0, "Do not run the actual benchmark, but just test RTC compilation (1 full test, 2 test only compilation)") +AddOption(cacheMutex, bool, true, "", 0, "Use a file lock to serialize access to the cache folder") AddHelp("help", 'h') EndConfig() @@ -276,11 +278,13 @@ AddOption(tpcSingleSector, int, -1, "", 0, "Restrict TPC processing to a single AddOption(tpcDownscaledEdx, unsigned char, 0, "", 0, "If != 0, downscale dEdx processing (if enabled) to x %") AddOption(tpcMaxAttachedClustersPerSectorRow, unsigned int, 51000, "", 0, "Maximum number of TPC attached clusters which can be decoded per SectorRow") AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding") +AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored") AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr) AddSubConfig(GPUSettingsProcessingRTC, rtc) AddSubConfig(GPUSettingsProcessingParam, param) AddHelp("help", 'h') EndConfig() +#endif // __OPENCL__ #ifndef GPUCA_GPUCODE_DEVICE // Light settings concerning the event display (can be changed without rebuilding vertices) diff --git a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx index 4e3cff32c5bc9..92199f06a5f68 100644 --- a/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx +++ b/GPU/GPUTracking/Standalone/Benchmark/standalone.cxx @@ -297,8 +297,8 @@ int SetupReconstruction() GPUSettingsGRP grp = rec->GetGRPSettings(); GPUSettingsRec recSet; GPUSettingsProcessing procSet; - memcpy((void*)&recSet, (void*)&configStandalone.rec, sizeof(GPUSettingsRec)); - memcpy((void*)&procSet, (void*)&configStandalone.proc, sizeof(GPUSettingsProcessing)); + recSet = configStandalone.rec; + procSet = configStandalone.proc; GPURecoStepConfiguration steps; if (configStandalone.eventGenerator) { diff --git a/GPU/Workflow/src/GPUWorkflowSpec.cxx b/GPU/Workflow/src/GPUWorkflowSpec.cxx index 36ce6da419d4b..af65de4ce5c48 100644 --- a/GPU/Workflow/src/GPUWorkflowSpec.cxx +++ b/GPU/Workflow/src/GPUWorkflowSpec.cxx @@ -321,11 +321,11 @@ void GPURecoWorkflowSpec::init(InitContext& ic) mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH; fd = open("/tmp/o2_gpu_memlock_mutex.lock", O_RDWR | O_CREAT | O_CLOEXEC, mask); if (fd == -1) { - throw std::runtime_error("Error opening lock file"); + throw std::runtime_error("Error opening memlock mutex lock file"); } fchmod(fd, mask); if (lockf(fd, F_LOCK, 0)) { - throw std::runtime_error("Error locking file"); + throw std::runtime_error("Error locking memlock mutex file"); } } std::chrono::time_point start, end; @@ -342,7 +342,7 @@ void GPURecoWorkflowSpec::init(InitContext& ic) } if (mConfParam->mutexMemReg) { if (lockf(fd, F_ULOCK, 0)) { - throw std::runtime_error("Error unlocking file"); + throw std::runtime_error("Error unlocking memlock mutex file"); } close(fd); } From 930951cb9bf1a9f2a9042e3d8fea3ae3ac2c7bbb Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 22 Apr 2024 14:04:29 +0200 Subject: [PATCH 3/4] GPU CMake: Fix -W option must be passed only to compiler, not to device compilation --- GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx | 1 - dependencies/FindO2GPU.cmake | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx index 5a4ee1ea01e25..b291bf735aee3 100644 --- a/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx +++ b/GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx @@ -238,7 +238,6 @@ int GPUReconstructionCUDA::genRTC(std::string& filename, unsigned int& nCompile) } } fclose(fp); - } #endif } diff --git a/dependencies/FindO2GPU.cmake b/dependencies/FindO2GPU.cmake index 302f5ab93209f..bd244fcaaecbe 100644 --- a/dependencies/FindO2GPU.cmake +++ b/dependencies/FindO2GPU.cmake @@ -111,7 +111,7 @@ if(ENABLE_CUDA) endif() endif() if(CMAKE_CUDA_COMPILER) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"${O2_GPU_CMAKE_CXX_FLAGS_NOSTD}\" --expt-relaxed-constexpr --extended-lambda --allow-unsupported-compiler -Xptxas -v -Wno-attributes") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"${O2_GPU_CMAKE_CXX_FLAGS_NOSTD}\" --expt-relaxed-constexpr --extended-lambda --allow-unsupported-compiler -Xptxas -v -Xcompiler -Wno-attributes") if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.3") string(APPEND CMAKE_CUDA_FLAGS " -Xcudafe --diag_suppress=20257") # TODO: Cleanup endif() From 0f647fcdb6bee76d7f57c95288c3bb9897095dc4 Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 22 Apr 2024 14:07:54 +0200 Subject: [PATCH 4/4] dpl-workflow: Add option to use GPU RTC --- prodtests/full-system-test/dpl-workflow.sh | 1 + prodtests/full-system-test/start_tmux.sh | 7 ++----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/prodtests/full-system-test/dpl-workflow.sh b/prodtests/full-system-test/dpl-workflow.sh index f67c5c01e81f1..5b5d84c413abe 100755 --- a/prodtests/full-system-test/dpl-workflow.sh +++ b/prodtests/full-system-test/dpl-workflow.sh @@ -320,6 +320,7 @@ if has_detector_calib PHS && workflow_has_parameter CALIB; then fi [[ ${O2_GPU_DOUBLE_PIPELINE:-$EPNSYNCMODE} == 1 ]] && GPU_CONFIG+=" --enableDoublePipeline" +[[ ${O2_GPU_RTC:-0} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc_rtc.enable=1;GPU_proc_rtc.cacheOutput=1;GPU_proc.RTCcacheFolder=/tmp/o2_gpu_rtc_cache;" ( workflow_has_parameter AOD || [[ -z "$DISABLE_ROOT_OUTPUT" ]] || needs_root_output o2-emcal-cell-writer-workflow ) && has_detector EMC && RAW_EMC_SUBSPEC=" --subspecification 1 " has_detector_reco MID && has_detector_matching MCHMID && MFTMCHConf="FwdMatching.useMIDMatch=true;" || MFTMCHConf="FwdMatching.useMIDMatch=false;" diff --git a/prodtests/full-system-test/start_tmux.sh b/prodtests/full-system-test/start_tmux.sh index e56514196afe3..22b0ce2ddcd2a 100755 --- a/prodtests/full-system-test/start_tmux.sh +++ b/prodtests/full-system-test/start_tmux.sh @@ -97,19 +97,16 @@ fi FST_SLEEP0=0 FST_SLEEP1=0 -FST_SLEEP2=45 +FST_SLEEP2=30 if [[ -z $SHM_MANAGER_SHMID ]]; then rm -f /dev/shm/*fmq* if [[ `ls /dev/shm/*fmq* 2> /dev/null | wc -l` != "0" ]]; then echo "FMQ SHM files left which cannot be deleted, please clean up!" exit 1 fi -else - FST_SLEEP0=0 - FST_SLEEP1=0 - FST_SLEEP2=30 fi [[ ! -z $FST_TMUX_DD_WAIT ]] && FST_SLEEP2=$FST_TMUX_DD_WAIT +[[ ${O2_GPU_RTC:-0} == 1 ]] && FST_SLEEP2=60 if workflow_has_parameter CALIB_PROXIES; then CALIB_COMMAND="$GEN_TOPO_MYDIR/aggregator-workflow.sh"