Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GPU: Possibility to use GPU RTC with caching and multiple GPUs from dpl-workflow.sh and unrelated fixes #13063

Merged
merged 4 commits into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/cuda/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ if(NOT GPUCA_CUDA_COMPILE_MODE STREQUAL "rdc")
set_target_properties(${targetName} PROPERTIES LINKER_LANGUAGE CXX)
endif()

if(NOT ALIGPU_BUILD_TYPE STREQUAL "ALIROOT")
if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS)
add_library(GPUTrackingCUDAExternalProvider OBJECT GPUReconstructionCUDAExternalProvider.cu)
add_library(O2::GPUTrackingCUDAExternalProvider ALIAS GPUTrackingCUDAExternalProvider)
set_property(TARGET GPUTrackingCUDAExternalProvider PROPERTY CUDA_SEPARABLE_COMPILATION ON)
Expand Down
30 changes: 28 additions & 2 deletions GPU/GPUTracking/Base/cuda/GPUReconstructionCUDAGenRTC.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@
#ifdef GPUCA_HAVE_O2HEADERS
#include "Framework/SHA1.h"
#endif
#include <sys/stat.h>
#include <fcntl.h>
#include <filesystem>

using namespace GPUCA_NAMESPACE::gpu;

Expand Down Expand Up @@ -62,11 +65,27 @@ int GPUReconstructionCUDA::genRTC(std::string& filename, unsigned int& nCompile)

nCompile = mProcessingSettings.rtc.compilePerKernel ? kernels.size() : 1;
bool cacheLoaded = false;
int fd = 0;
if (mProcessingSettings.rtc.cacheOutput) {
if (mProcessingSettings.RTCcacheFolder != ".") {
std::filesystem::create_directories(mProcessingSettings.RTCcacheFolder);
}
#ifndef GPUCA_HAVE_O2HEADERS
throw std::runtime_error("Cannot use RTC cache without O2 headers");
#else
FILE* fp = fopen("rtc.cuda.cache", "rb");
if (mProcessingSettings.rtc.cacheMutex) {
mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
fd = open((mProcessingSettings.RTCcacheFolder + "/cache.lock").c_str(), O_RDWR | O_CREAT | O_CLOEXEC, mask);
if (fd == -1) {
throw std::runtime_error("Error opening rtc cache mutex lock file");
}
fchmod(fd, mask);
if (lockf(fd, F_LOCK, 0)) {
throw std::runtime_error("Error locking rtc cache mutex file");
}
}

FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "rb");
char sharead[20];
if (fp) {
size_t len;
Expand Down Expand Up @@ -100,6 +119,7 @@ int GPUReconstructionCUDA::genRTC(std::string& filename, unsigned int& nCompile)
break;
}
GPUSettingsProcessingRTC cachedSettings;
static_assert(std::is_trivially_copyable_v<GPUSettingsProcessingRTC> == true, "GPUSettingsProcessingRTC must be POD");
if (fread(&cachedSettings, sizeof(cachedSettings), 1, fp) != 1) {
throw std::runtime_error("Cache file corrupt");
}
Expand Down Expand Up @@ -183,7 +203,7 @@ int GPUReconstructionCUDA::genRTC(std::string& filename, unsigned int& nCompile)
}
#ifdef GPUCA_HAVE_O2HEADERS
if (mProcessingSettings.rtc.cacheOutput) {
FILE* fp = fopen("rtc.cuda.cache", "w+b");
FILE* fp = fopen((mProcessingSettings.RTCcacheFolder + "/rtc.cuda.cache").c_str(), "w+b");
if (fp == nullptr) {
throw std::runtime_error("Cannot open cache file for writing");
}
Expand Down Expand Up @@ -221,6 +241,12 @@ int GPUReconstructionCUDA::genRTC(std::string& filename, unsigned int& nCompile)
}
#endif
}
if (mProcessingSettings.rtc.cacheOutput && mProcessingSettings.rtc.cacheMutex) {
if (lockf(fd, F_ULOCK, 0)) {
throw std::runtime_error("Error unlocking RTC cache mutex file");
}
close(fd);
}

#endif
return 0;
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Base/hip/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ if(NOT GPUCA_HIP_COMPILE_MODE STREQUAL "rdc")
target_link_options(${targetName} PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-fno-gpu-rdc>)
endif()

if(NOT ALIGPU_BUILD_TYPE STREQUAL "ALIROOT")
if(ALIGPU_BUILD_TYPE STREQUAL "O2" OR CONFIG_O2_EXTENSIONS)
add_library(GPUTrackingHIPExternalProvider OBJECT ${GPUCA_HIP_SOURCE_DIR}/GPUReconstructionHIPExternalProvider.hip)
add_library(O2::GPUTrackingHIPExternalProvider ALIAS GPUTrackingHIPExternalProvider)
target_compile_options(GPUTrackingHIPExternalProvider PRIVATE $<$<COMPILE_LANGUAGE:HIP>:-fgpu-rdc>)
Expand Down
4 changes: 4 additions & 0 deletions GPU/GPUTracking/Definitions/GPUSettingsList.h
Original file line number Diff line number Diff line change
Expand Up @@ -191,13 +191,15 @@ AddSubConfig(GPUSettingsRecTRD, trd)
AddHelp("help", 'h')
EndConfig()

#ifndef __OPENCL__
// Settings steering the processing once the device was selected, only available on the host
// Run-time compilation (RTC) options, declared as sub-configuration struct GPUSettingsProcessingRTC with member name `rtc`.
// NOTE(review): GPUReconstructionCUDA::genRTC reads a GPUSettingsProcessingRTC back from the RTC cache file with a raw
// fread and static_asserts that the type is trivially copyable. Presumably this means only trivially copyable option
// types may be added here, and adding / reordering options changes the binary layout stored in existing cache files
// - confirm before editing this list.
BeginSubConfig(GPUSettingsProcessingRTC, rtc, configStandalone.proc, "RTC", 0, "Processing settings", proc_rtc)
AddOption(cacheOutput, bool, false, "", 0, "Cache RTC compilation results")
AddOption(optConstexpr, bool, true, "", 0, "Replace constant variables by static constexpr expressions")
AddOption(compilePerKernel, bool, true, "", 0, "Run one RTC compilation per kernel")
AddOption(enable, bool, false, "", 0, "Use RTC to optimize GPU code")
AddOption(runTest, int, 0, "", 0, "Do not run the actual benchmark, but just test RTC compilation (1 full test, 2 test only compilation)")
// Only consulted together with cacheOutput: genRTC then takes a lockf() lock on "cache.lock" inside the RTC cache folder
// before opening the cache file and releases it once cache handling is finished.
AddOption(cacheMutex, bool, true, "", 0, "Use a file lock to serialize access to the cache folder")
AddHelp("help", 'h')
EndConfig()

Expand Down Expand Up @@ -276,11 +278,13 @@ AddOption(tpcSingleSector, int, -1, "", 0, "Restrict TPC processing to a single
AddOption(tpcDownscaledEdx, unsigned char, 0, "", 0, "If != 0, downscale dEdx processing (if enabled) to x %")
AddOption(tpcMaxAttachedClustersPerSectorRow, unsigned int, 51000, "", 0, "Maximum number of TPC attached clusters which can be decoded per SectorRow")
AddOption(tpcUseOldCPUDecoding, bool, false, "", 0, "Enable old CPU-based TPC decoding")
AddOption(RTCcacheFolder, std::string, "./rtccache/", "", 0, "Folder in which the cache file is stored")
AddVariable(eventDisplay, GPUCA_NAMESPACE::gpu::GPUDisplayFrontendInterface*, nullptr)
AddSubConfig(GPUSettingsProcessingRTC, rtc)
AddSubConfig(GPUSettingsProcessingParam, param)
AddHelp("help", 'h')
EndConfig()
#endif // __OPENCL__

#ifndef GPUCA_GPUCODE_DEVICE
// Light settings concerning the event display (can be changed without rebuilding vertices)
Expand Down
4 changes: 2 additions & 2 deletions GPU/GPUTracking/Standalone/Benchmark/standalone.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -297,8 +297,8 @@ int SetupReconstruction()
GPUSettingsGRP grp = rec->GetGRPSettings();
GPUSettingsRec recSet;
GPUSettingsProcessing procSet;
memcpy((void*)&recSet, (void*)&configStandalone.rec, sizeof(GPUSettingsRec));
memcpy((void*)&procSet, (void*)&configStandalone.proc, sizeof(GPUSettingsProcessing));
recSet = configStandalone.rec;
procSet = configStandalone.proc;
GPURecoStepConfiguration steps;

if (configStandalone.eventGenerator) {
Expand Down
1 change: 1 addition & 0 deletions GPU/GPUTracking/cmake/kernel_helpers.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ function(o2_gpu_add_kernel kernel_name kernel_files kernel_bounds kernel_type)
endif()
list(APPEND O2_GPU_KERNEL_TEMPLATE_FILES "${TMP_KERNEL_CLASS_FILE}.cxx")
list(REMOVE_DUPLICATES O2_GPU_KERNEL_TEMPLATE_FILES)
list(FILTER O2_GPU_KERNEL_TEMPLATE_FILES EXCLUDE REGEX "^-$")
list(TRANSFORM O2_GPU_KERNEL_TEMPLATE_FILES APPEND "\"")
list(TRANSFORM O2_GPU_KERNEL_TEMPLATE_FILES PREPEND "#include \"")
list(JOIN O2_GPU_KERNEL_TEMPLATE_FILES "\n" O2_GPU_KERNEL_TEMPLATE_FILES)
Expand Down
6 changes: 3 additions & 3 deletions GPU/Workflow/src/GPUWorkflowSpec.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -321,11 +321,11 @@ void GPURecoWorkflowSpec::init(InitContext& ic)
mode_t mask = S_IRUSR | S_IWUSR | S_IRGRP | S_IWGRP | S_IROTH | S_IWOTH;
fd = open("/tmp/o2_gpu_memlock_mutex.lock", O_RDWR | O_CREAT | O_CLOEXEC, mask);
if (fd == -1) {
throw std::runtime_error("Error opening lock file");
throw std::runtime_error("Error opening memlock mutex lock file");
}
fchmod(fd, mask);
if (lockf(fd, F_LOCK, 0)) {
throw std::runtime_error("Error locking file");
throw std::runtime_error("Error locking memlock mutex file");
}
}
std::chrono::time_point<std::chrono::high_resolution_clock> start, end;
Expand All @@ -342,7 +342,7 @@ void GPURecoWorkflowSpec::init(InitContext& ic)
}
if (mConfParam->mutexMemReg) {
if (lockf(fd, F_ULOCK, 0)) {
throw std::runtime_error("Error unlocking file");
throw std::runtime_error("Error unlocking memlock mutex file");
}
close(fd);
}
Expand Down
2 changes: 1 addition & 1 deletion dependencies/FindO2GPU.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ if(ENABLE_CUDA)
endif()
endif()
if(CMAKE_CUDA_COMPILER)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"${O2_GPU_CMAKE_CXX_FLAGS_NOSTD}\" --expt-relaxed-constexpr --extended-lambda --allow-unsupported-compiler -Xptxas -v -Wno-attributes")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler \"${O2_GPU_CMAKE_CXX_FLAGS_NOSTD}\" --expt-relaxed-constexpr --extended-lambda --allow-unsupported-compiler -Xptxas -v -Xcompiler -Wno-attributes")
if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL "12.3")
string(APPEND CMAKE_CUDA_FLAGS " -Xcudafe --diag_suppress=20257") # TODO: Cleanup
endif()
Expand Down
1 change: 1 addition & 0 deletions prodtests/full-system-test/dpl-workflow.sh
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,7 @@ if has_detector_calib PHS && workflow_has_parameter CALIB; then
fi

[[ ${O2_GPU_DOUBLE_PIPELINE:-$EPNSYNCMODE} == 1 ]] && GPU_CONFIG+=" --enableDoublePipeline"
[[ ${O2_GPU_RTC:-0} == 1 ]] && GPU_CONFIG_KEY+="GPU_proc_rtc.enable=1;GPU_proc_rtc.cacheOutput=1;GPU_proc.RTCcacheFolder=/tmp/o2_gpu_rtc_cache;"

( workflow_has_parameter AOD || [[ -z "$DISABLE_ROOT_OUTPUT" ]] || needs_root_output o2-emcal-cell-writer-workflow ) && has_detector EMC && RAW_EMC_SUBSPEC=" --subspecification 1 "
has_detector_reco MID && has_detector_matching MCHMID && MFTMCHConf="FwdMatching.useMIDMatch=true;" || MFTMCHConf="FwdMatching.useMIDMatch=false;"
Expand Down
7 changes: 2 additions & 5 deletions prodtests/full-system-test/start_tmux.sh
Original file line number Diff line number Diff line change
Expand Up @@ -97,19 +97,16 @@ fi

FST_SLEEP0=0
FST_SLEEP1=0
FST_SLEEP2=45
FST_SLEEP2=30
if [[ -z $SHM_MANAGER_SHMID ]]; then
rm -f /dev/shm/*fmq*
if [[ `ls /dev/shm/*fmq* 2> /dev/null | wc -l` != "0" ]]; then
echo "FMQ SHM files left which cannot be deleted, please clean up!"
exit 1
fi
else
FST_SLEEP0=0
FST_SLEEP1=0
FST_SLEEP2=30
fi
[[ ! -z $FST_TMUX_DD_WAIT ]] && FST_SLEEP2=$FST_TMUX_DD_WAIT
[[ ${O2_GPU_RTC:-0} == 1 ]] && FST_SLEEP2=60

if workflow_has_parameter CALIB_PROXIES; then
CALIB_COMMAND="$GEN_TOPO_MYDIR/aggregator-workflow.sh"
Expand Down
Loading