From c7a2e6e348e99a2ac7ca170c4c2d7b1eddc46a94 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 10 Jun 2024 11:06:39 +0200 Subject: [PATCH 1/5] Change smile for failed test runs --- bot/check-test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bot/check-test.sh b/bot/check-test.sh index f045b9500a..3b16e5c415 100755 --- a/bot/check-test.sh +++ b/bot/check-test.sh @@ -101,7 +101,7 @@ elif [[ ${ERROR} -eq 1 ]]; then reason="EESSI test suite was not run, test step itself failed to execute." status="FAILURE" else - summary=":grin: FAILURE" + summary=":cry: FAILURE" reason="Failed for unknown reason" status="FAILURE" fi From 911737d2bea6cea92c09262bcfa605e18ba553eb Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 13 Jun 2024 14:05:46 +0200 Subject: [PATCH 2/5] Add template field for mem per node --- reframe_config_bot.py.tmpl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/reframe_config_bot.py.tmpl b/reframe_config_bot.py.tmpl index 0cc3e9f530..607373767a 100644 --- a/reframe_config_bot.py.tmpl +++ b/reframe_config_bot.py.tmpl @@ -34,6 +34,11 @@ site_configuration = { 'options': ['--mem={size}'], } ], + 'extras': { + # Make sure to round down, otherwise a job might ask for more mem than is available + # per node + 'mem_per_node': __MEM_PER_NODE__, + }, 'max_jobs': 1 } ] From e10b227d3299a0945edd5ecede13b9784c4fd748 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 13 Jun 2024 14:14:36 +0200 Subject: [PATCH 3/5] Get memory limit for cgroup of current job / UID and put this into the ReFrame config file --- test_suite.sh | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/test_suite.sh b/test_suite.sh index 95eb9daa2a..5c157f7a47 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -135,7 +135,7 @@ export RFM_PREFIX=$PWD/reframe_runs echo "Configured reframe with the following environment variables:" env | grep "RFM_" -# Inject correct CPU properties into the ReFrame config file +# Inject correct CPU/memory properties into the ReFrame config file cpuinfo=$(lscpu) if [[ "${cpuinfo}" =~ CPU\(s\):[^0-9]*([0-9]+) ]]; then cpu_count=${BASH_REMATCH[1]} @@ -157,11 +157,19 @@ if [[ "${cpuinfo}" =~ (Core\(s\) per socket:[^0-9]*([0-9]+)) ]]; then else fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu." fi +cgroup_mem_bytes=$(cat /sys/fs/cgroup/memory/slurm/uid_${UID}/job_${SLURM_JOB_ID}/memory.limit_in_bytes) +if [[ $? -eq 0 ]] + # Convert to MiB + cgroup_mem_mib=$((cgroup_mem_bytes/(1024*1024))) +else + fatal_error "Failed to get the memory limit in bytes from the current cgroup" +fi cp ${RFM_CONFIG_FILE_TEMPLATE} ${RFM_CONFIG_FILES} sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES +sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib/g" $RFM_CONFIG_FILES # Workaround for https://github.com/EESSI/software-layer/pull/467#issuecomment-1973341966 export PSM3_DEVICES='self,shm' # this is enough, since we only run single node for now From 644eddcb66a46871cdfe93ef54ca70954169f548 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 13 Jun 2024 14:16:02 +0200 Subject: [PATCH 4/5] Missing bracket --- test_suite.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_suite.sh b/test_suite.sh index 5c157f7a47..2d072d3d95 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -169,7 +169,7 @@ sed -i "s/__NUM_CPUS__/${cpu_count}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_SOCKETS__/${socket_count}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_CPUS_PER_CORE__/${threads_per_core}/g" $RFM_CONFIG_FILES sed -i "s/__NUM_CPUS_PER_SOCKET__/${cores_per_socket}/g" $RFM_CONFIG_FILES -sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib/g" $RFM_CONFIG_FILES +sed -i "s/__MEM_PER_NODE__/${cgroup_mem_mib}/g" $RFM_CONFIG_FILES # Workaround for https://github.com/EESSI/software-layer/pull/467#issuecomment-1973341966 export PSM3_DEVICES='self,shm' # this is enough, since we only run single node for now From bcba10e3553806f4f1dda6d27412f4756464f13c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 13 Jun 2024 14:16:48 +0200 Subject: [PATCH 5/5] Fix if-else syntax --- test_suite.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_suite.sh b/test_suite.sh index 2d072d3d95..46b43ee78a 100755 --- a/test_suite.sh +++ b/test_suite.sh @@ -158,7 +158,7 @@ else fatal_error "Failed to get the number of cores per socket for the current test hardware with lscpu." fi cgroup_mem_bytes=$(cat /sys/fs/cgroup/memory/slurm/uid_${UID}/job_${SLURM_JOB_ID}/memory.limit_in_bytes) -if [[ $? -eq 0 ]] +if [[ $? -eq 0 ]]; then # Convert to MiB cgroup_mem_mib=$((cgroup_mem_bytes/(1024*1024))) else