Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Failing sanity checks of Gromacs on Zen4/H100 #241

Open
laraPPr opened this issue Feb 14, 2025 · 3 comments
Open

Failing sanity checks of Gromacs on Zen4/H100 #241

laraPPr opened this issue Feb 14, 2025 · 3 comments
Labels
bug Something isn't working

Comments

@laraPPr
Copy link
Collaborator

laraPPr commented Feb 14, 2025

          "build_jobid": null,
          "build_stderr": null,
          "build_stdout": null,
          "dependencies_actual": [],
          "dependencies_conceptual": [],
          "environ": "default",
          "fail_phase": "sanity",
          "fail_reason": "sanity error: -3377560.0 is beyond reference value -3270800.0 (l=-3274070.8, u=-3267529.2)",
          "filename": "/kyukon/home/gent/461/vsc46128/rfm.Txsh5gIMTo/test-suite/eessi/testsuite/tests/apps/gromacs.py",
          "fixture": false,
          "job_completion_time": "2025-02-12T18:22:29+01:00",
          "job_completion_time_unix": 1739380949.0,
          "job_stderr": "rfm_job.err",
          "job_stdout": "rfm_job.out",
          "partition": "litleo",
          "result": "fail",
          "run_index": 0,
          "scheduler": "slurm",
          "session_uuid": "51454cd1-6870-418f-b969-8432d0218c06",
          "time_compile": 0.004049062728881836,
          "time_performance": null,
          "time_run": 1595.5709257125854,
          "time_sanity": 0.0031495094299316406,
          "time_setup": 0.014080286026000977,
          "time_total": 3290.0120844841003,
          "uuid": "51454cd1-6870-418f-b969-8432d0218c06:0:46",
          "fail_info": {
            "exc_type": "SanityError",
            "exc_value": "-3377560.0 is beyond reference value -3270800.0 (l=-3274070.8, u=-3267529.2)",
            "traceback": [
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/frontend/executors/__init__.py\", line 383, in _safe_call\n    return fn(*args, **kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/hooks.py\", line 109, in _fn\n    func(obj, *args, **kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/pipeline.py\", line 2149, in sanity\n    self.check_sanity()\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/pipeline.py\", line 2210, in check_sanity\n    success = sn.evaluate(self.sanity_patterns)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 929, in evaluate\n    return expr.evaluate(cache=cache)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 78, in evaluate\n    ret = ret.evaluate()\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 73, in evaluate\n    ret = self._fn(*fn_args, **fn_kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 77, in all\n    return builtins.all(iterable)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 89, in __bool__\n    return builtins.bool(self.evaluate())\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 73, in evaluate\n    ret = self._fn(*fn_args, **fn_kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 585, in assert_reference\n    raise SanityError(_format(error_msg, val, ref, lower, upper)) from None\n"
            ]
          },
          "fail_severe": false,
          "EESSI_CONFIGS_BRANCH": "v0.5.1",
          "EESSI_CONFIGS_URL": "https://github.com/EESSI/test-suite.git",
          "job_exitcode": 0,
          "job_nodelist": [
            "node4304.litleo.os"
          ],
          "job_submit_time": 1739379355.021807,
          "jobid": "43001294",
          "system": "litleo",
          "bench_name": "HECBioSim/hEGFRDimerSmallerPL",
          "benchmark_info": "HECBioSim/hEGFRDimerSmallerPL",
          "benchmark_version": "1.0.0",
          "build_locally": true,
          "build_time_limit": null,
          "cvmfs_repo_name": "None",
          "cvmfs_software_subdir": "['']",
          "descr": "GROMACS HECBioSim/hEGFRDimerSmallerPL benchmark (NB: gpu)",
          "display_name": "EESSI_GROMACS %benchmark_info=HECBioSim/hEGFRDimerSmallerPL %nb_impl=gpu %scale=1_2_node %module_name=GROMACS/2024.3-foss-2023b-CUDA-12.4.0",
          "eessi_testsuite_version": "0.5.1",
          "env_vars": {
            "OMP_NUM_THREADS": 24
          },
          "exact_memory": false,
          "exclusive_access": false,
          "executable": "gmx_mpi mdrun",
          "executable_opts": [
            "-nb",
            "gpu",
            "-s",
            "benchmark.tpr",
            "-dlb",
            "yes",
            "-npme",
            "-1",
            "-ntomp",
            "24"
          ],
          "extra_resources": {
            "memory": {
              "size": "157920M"
            },
            "_rfm_gpu": {
              "num_gpus_per_node": 1
            }
          },
          "full_modulepath": "['/apps/gent/RHEL9/zen4-hopper-ib/modules/all/GROMACS/2024.3-foss-2023b-CUDA-12.4.0.lua']",
          "hashcode": "d0992d39",
          "keep_files": [
            "md.log"
          ],
          "local": false,
          "maintainers": [],
          "max_pending_time": null,
          "measure_memory_usage": false,
          "module_name": "GROMACS/2024.3-foss-2023b-CUDA-12.4.0",
          "modules": [
            "GROMACS/2024.3-foss-2023b-CUDA-12.4.0"
          ],
          "name": "EESSI_GROMACS %benchmark_info=HECBioSim/hEGFRDimerSmallerPL %nb_impl=gpu %scale=1_2_node %module_name=GROMACS/2024.3-foss-2023b-CUDA-12.4.0",
          "nb_impl": "gpu",
          "num_cpus_per_task": 24,
          "num_gpus_per_node": 1,
          "num_tasks": 1,
          "num_tasks_per_core": null,
          "num_tasks_per_node": 1,
          "num_tasks_per_socket": null,
          "outputdir": "/data/gent/vo/000/gvo00002/vsc46128/ReFrame_EESSI_CI/reframe_prefix/output/litleo/litleo/default/EESSI_GROMACS_d0992d39",
          "perfvalues": {
            "litleo:litleo:perf": [
              null,
              0,
              null,
              null,
              "ns/day"
            ]
          },
          "postbuild_cmds": [],
          "postrun_cmds": [
            "echo \"EESSI_CVMFS_REPO: $EESSI_CVMFS_REPO\"",
            "echo \"EESSI_SOFTWARE_SUBDIR: $EESSI_SOFTWARE_SUBDIR\"",
            "echo \"FULL_MODULEPATH: $(module --location show GROMACS/2024.3-foss-2023b-CUDA-12.4.0)\""
          ],
          "prebuild_cmds": [],
          "prefix": "/kyukon/home/gent/461/vsc46128/rfm.Txsh5gIMTo/test-suite/eessi/testsuite/tests/apps",
          "prerun_cmds": [
            "curl -LJO https://github.com/victorusu/GROMACS_Benchmark_Suite/raw/1.0.0/HECBioSim/hEGFRDimerSmallerPL/benchmark.tpr"
          ],
          "readonly_files": [
            ""
          ],
          "scale": "1_2_node",
          "short_name": "EESSI_GROMACS_d0992d39",
          "sourcepath": "",
          "sourcesdir": null,
          "stagedir": "/data/gent/vo/000/gvo00002/vsc46128/ReFrame_EESSI_CI/reframe_prefix/stage/litleo/litleo/default/EESSI_GROMACS_d0992d39",
          "strict_check": true,
          "tags": [
            "chemistry",
            "sciapp",
            "1_2_node"
          ],
          "time_limit": 1800.0,
          "unique_name": "EESSI_GROMACS_0883",
          "use_multithreading": null,
          "valid_prog_environs": [
            "default"
          ],
          "valid_systems": [
            "+1_2_node +gpu %GPU_VENDOR=nvidia"
          ],
          "variables": {
            "OMP_NUM_THREADS": 24
          }
        },
        {
          "build_jobid": null,
          "build_stderr": null,
          "build_stdout": null,
          "dependencies_actual": [],
          "dependencies_conceptual": [],
          "environ": "default",
          "fail_phase": "sanity",
          "fail_reason": "sanity error: -3377070.0 is beyond reference value -3270800.0 (l=-3274070.8, u=-3267529.2)",
          "filename": "/kyukon/home/gent/461/vsc46128/rfm.Txsh5gIMTo/test-suite/eessi/testsuite/tests/apps/gromacs.py",
          "fixture": false,
          "job_completion_time": "2025-02-12T18:25:07+01:00",
          "job_completion_time_unix": 1739381107.0,
          "job_stderr": "rfm_job.err",
          "job_stdout": "rfm_job.out",
          "partition": "litleo",
          "result": "fail",
          "run_index": 0,
          "scheduler": "slurm",
          "session_uuid": "51454cd1-6870-418f-b969-8432d0218c06",
          "time_compile": 0.004175424575805664,
          "time_performance": null,
          "time_run": 1706.6625003814697,
          "time_sanity": 0.005077362060546875,
          "time_setup": 0.014171361923217773,
          "time_total": 3447.268796682358,
          "uuid": "51454cd1-6870-418f-b969-8432d0218c06:0:47",
          "fail_info": {
            "exc_type": "SanityError",
            "exc_value": "-3377070.0 is beyond reference value -3270800.0 (l=-3274070.8, u=-3267529.2)",
            "traceback": [
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/frontend/executors/__init__.py\", line 383, in _safe_call\n    return fn(*args, **kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/hooks.py\", line 109, in _fn\n    func(obj, *args, **kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/pipeline.py\", line 2149, in sanity\n    self.check_sanity()\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/pipeline.py\", line 2210, in check_sanity\n    success = sn.evaluate(self.sanity_patterns)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 929, in evaluate\n    return expr.evaluate(cache=cache)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 78, in evaluate\n    ret = ret.evaluate()\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 73, in evaluate\n    ret = self._fn(*fn_args, **fn_kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 77, in all\n    return builtins.all(iterable)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 89, in __bool__\n    return builtins.bool(self.evaluate())\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 73, in evaluate\n    ret = self._fn(*fn_args, **fn_kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 585, in assert_reference\n    raise SanityError(_format(error_msg, val, ref, lower, upper)) from None\n"
            ]
          },
          "fail_severe": false,
          "EESSI_CONFIGS_BRANCH": "v0.5.1",
          "EESSI_CONFIGS_URL": "https://github.com/EESSI/test-suite.git",
          "job_exitcode": 0,
          "job_nodelist": [
            "node4301.litleo.os"
          ],
          "job_submit_time": 1739379401.2127714,
          "jobid": "43001295",
          "system": "litleo",
          "bench_name": "HECBioSim/hEGFRDimerSmallerPL",
          "benchmark_info": "HECBioSim/hEGFRDimerSmallerPL",
          "benchmark_version": "1.0.0",
          "build_locally": true,
          "build_time_limit": null,
          "cvmfs_repo_name": "None",
          "cvmfs_software_subdir": "['']",
          "descr": "GROMACS HECBioSim/hEGFRDimerSmallerPL benchmark (NB: gpu)",
          "display_name": "EESSI_GROMACS %benchmark_info=HECBioSim/hEGFRDimerSmallerPL %nb_impl=gpu %scale=1_2_node %module_name=GROMACS/2024.2-foss-2023b-CUDA-12.5.0",
          "eessi_testsuite_version": "0.5.1",
          "env_vars": {
            "OMP_NUM_THREADS": 24
          },
          "exact_memory": false,
          "exclusive_access": false,
          "executable": "gmx_mpi mdrun",
          "executable_opts": [
            "-nb",
            "gpu",
            "-s",
            "benchmark.tpr",
            "-dlb",
            "yes",
            "-npme",
            "-1",
            "-ntomp",
            "24"
          ],
          "extra_resources": {
            "memory": {
              "size": "157920M"
            },
            "_rfm_gpu": {
              "num_gpus_per_node": 1
            }
          },
          "full_modulepath": "['/apps/gent/RHEL9/zen4-hopper-ib/modules/all/GROMACS/2024.2-foss-2023b-CUDA-12.5.0.lua']",
          "hashcode": "7d10a983",
          "keep_files": [
            "md.log"
          ],
          "local": false,
          "maintainers": [],
          "max_pending_time": null,
          "measure_memory_usage": false,
          "module_name": "GROMACS/2024.2-foss-2023b-CUDA-12.5.0",
          "modules": [
            "GROMACS/2024.2-foss-2023b-CUDA-12.5.0"
          ],
          "name": "EESSI_GROMACS %benchmark_info=HECBioSim/hEGFRDimerSmallerPL %nb_impl=gpu %scale=1_2_node %module_name=GROMACS/2024.2-foss-2023b-CUDA-12.5.0",
          "nb_impl": "gpu",
          "num_cpus_per_task": 24,
          "num_gpus_per_node": 1,
          "num_tasks": 1,
          "num_tasks_per_core": null,
          "num_tasks_per_node": 1,
          "num_tasks_per_socket": null,
          "outputdir": "/data/gent/vo/000/gvo00002/vsc46128/ReFrame_EESSI_CI/reframe_prefix/output/litleo/litleo/default/EESSI_GROMACS_7d10a983",
          "perfvalues": {
            "litleo:litleo:perf": [
              null,
              0,
              null,
              null,
              "ns/day"
            ]
          },
          "postbuild_cmds": [],
          "postrun_cmds": [
            "echo \"EESSI_CVMFS_REPO: $EESSI_CVMFS_REPO\"",
            "echo \"EESSI_SOFTWARE_SUBDIR: $EESSI_SOFTWARE_SUBDIR\"",
            "echo \"FULL_MODULEPATH: $(module --location show GROMACS/2024.2-foss-2023b-CUDA-12.5.0)\""
          ],
          "prebuild_cmds": [],
          "prefix": "/kyukon/home/gent/461/vsc46128/rfm.Txsh5gIMTo/test-suite/eessi/testsuite/tests/apps",
          "prerun_cmds": [
            "curl -LJO https://github.com/victorusu/GROMACS_Benchmark_Suite/raw/1.0.0/HECBioSim/hEGFRDimerSmallerPL/benchmark.tpr"
          ],
          "readonly_files": [
            ""
          ],
          "scale": "1_2_node",
          "short_name": "EESSI_GROMACS_7d10a983",
          "sourcepath": "",
          "sourcesdir": null,
          "stagedir": "/data/gent/vo/000/gvo00002/vsc46128/ReFrame_EESSI_CI/reframe_prefix/stage/litleo/litleo/default/EESSI_GROMACS_7d10a983",
          "strict_check": true,
          "tags": [
            "chemistry",
            "sciapp",
            "1_2_node"
          ],
          "time_limit": 1800.0,
          "unique_name": "EESSI_GROMACS_0882",
          "use_multithreading": null,
          "valid_prog_environs": [
            "default"
          ],
          "valid_systems": [
            "+1_2_node +gpu %GPU_VENDOR=nvidia"
          ],
          "variables": {
            "OMP_NUM_THREADS": 24
          }
        },
        {
          "build_jobid": null,
          "build_stderr": null,
          "build_stdout": null,
          "dependencies_actual": [],
          "dependencies_conceptual": [],
          "environ": "default",
          "fail_phase": "sanity",
          "fail_reason": "sanity error: -3433780.0 is beyond reference value -3328920.0 (l=-3332248.9199999995, u=-3325591.08)",
          "filename": "/kyukon/home/gent/461/vsc46128/rfm.Txsh5gIMTo/test-suite/eessi/testsuite/tests/apps/gromacs.py",
          "fixture": false,
          "job_completion_time": "2025-02-12T18:28:00+01:00",
          "job_completion_time_unix": 1739381280.0,
          "job_stderr": "rfm_job.err",
          "job_stdout": "rfm_job.out",
          "partition": "litleo",
          "result": "fail",
          "run_index": 0,
          "scheduler": "slurm",
          "session_uuid": "51454cd1-6870-418f-b969-8432d0218c06",
          "time_compile": 0.00406336784362793,
          "time_performance": null,
          "time_run": 1864.7826979160309,
          "time_sanity": 0.003868579864501953,
          "time_setup": 0.014059066772460938,
          "time_total": 3621.679725408554,
          "uuid": "51454cd1-6870-418f-b969-8432d0218c06:0:48",
          "fail_info": {
            "exc_type": "SanityError",
            "exc_value": "-3433780.0 is beyond reference value -3328920.0 (l=-3332248.9199999995, u=-3325591.08)",
            "traceback": [
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/frontend/executors/__init__.py\", line 383, in _safe_call\n    return fn(*args, **kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/hooks.py\", line 109, in _fn\n    func(obj, *args, **kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/pipeline.py\", line 2149, in sanity\n    self.check_sanity()\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/pipeline.py\", line 2210, in check_sanity\n    success = sn.evaluate(self.sanity_patterns)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 929, in evaluate\n    return expr.evaluate(cache=cache)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 78, in evaluate\n    ret = ret.evaluate()\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 73, in evaluate\n    ret = self._fn(*fn_args, **fn_kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 77, in all\n    return builtins.all(iterable)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 89, in __bool__\n    return builtins.bool(self.evaluate())\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 73, in evaluate\n    ret = self._fn(*fn_args, **fn_kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 585, in assert_reference\n    raise SanityError(_format(error_msg, val, ref, lower, upper)) from None\n"
            ]
          },
          "fail_severe": false,
          "EESSI_CONFIGS_BRANCH": "v0.5.1",
          "EESSI_CONFIGS_URL": "https://github.com/EESSI/test-suite.git",
          "job_exitcode": 0,
          "job_nodelist": [
            "node4307.litleo.os"
          ],
          "job_submit_time": 1739379417.5243123,
          "jobid": "43001296",
          "system": "litleo",
          "bench_name": "HECBioSim/hEGFRDimer",
          "benchmark_info": "HECBioSim/hEGFRDimer",
          "benchmark_version": "1.0.0",
          "build_locally": true,
          "build_time_limit": null,
          "cvmfs_repo_name": "None",
          "cvmfs_software_subdir": "['']",
          "descr": "GROMACS HECBioSim/hEGFRDimer benchmark (NB: gpu)",
          "display_name": "EESSI_GROMACS %benchmark_info=HECBioSim/hEGFRDimer %nb_impl=gpu %scale=1_2_node %module_name=GROMACS/2024.3-foss-2023b-CUDA-12.4.0",
          "eessi_testsuite_version": "0.5.1",
          "env_vars": {
            "OMP_NUM_THREADS": 24
          },
          "exact_memory": false,
          "exclusive_access": false,
          "executable": "gmx_mpi mdrun",
          "executable_opts": [
            "-nb",
            "gpu",
            "-s",
            "benchmark.tpr",
            "-dlb",
            "yes",
            "-npme",
            "-1",
            "-ntomp",
            "24"
          ],
          "extra_resources": {
            "memory": {
              "size": "157920M"
            },
            "_rfm_gpu": {
              "num_gpus_per_node": 1
            }
          },
          "full_modulepath": "['/apps/gent/RHEL9/zen4-hopper-ib/modules/all/GROMACS/2024.3-foss-2023b-CUDA-12.4.0.lua']",
          "hashcode": "f904246a",
          "keep_files": [
            "md.log"
          ],
          "local": false,
          "maintainers": [],
          "max_pending_time": null,
          "measure_memory_usage": false,
          "module_name": "GROMACS/2024.3-foss-2023b-CUDA-12.4.0",
          "modules": [
            "GROMACS/2024.3-foss-2023b-CUDA-12.4.0"
          ],
          "name": "EESSI_GROMACS %benchmark_info=HECBioSim/hEGFRDimer %nb_impl=gpu %scale=1_2_node %module_name=GROMACS/2024.3-foss-2023b-CUDA-12.4.0",
          "nb_impl": "gpu",
          "num_cpus_per_task": 24,
          "num_gpus_per_node": 1,
          "num_tasks": 1,
          "num_tasks_per_core": null,
          "num_tasks_per_node": 1,
          "num_tasks_per_socket": null,
          "outputdir": "/data/gent/vo/000/gvo00002/vsc46128/ReFrame_EESSI_CI/reframe_prefix/output/litleo/litleo/default/EESSI_GROMACS_f904246a",
          "perfvalues": {
            "litleo:litleo:perf": [
              null,
              0,
              null,
              null,
              "ns/day"
            ]
          },
          "postbuild_cmds": [],
          "postrun_cmds": [
            "echo \"EESSI_CVMFS_REPO: $EESSI_CVMFS_REPO\"",
            "echo \"EESSI_SOFTWARE_SUBDIR: $EESSI_SOFTWARE_SUBDIR\"",
            "echo \"FULL_MODULEPATH: $(module --location show GROMACS/2024.3-foss-2023b-CUDA-12.4.0)\""
          ],
          "prebuild_cmds": [],
          "prefix": "/kyukon/home/gent/461/vsc46128/rfm.Txsh5gIMTo/test-suite/eessi/testsuite/tests/apps",
          "prerun_cmds": [
            "curl -LJO https://github.com/victorusu/GROMACS_Benchmark_Suite/raw/1.0.0/HECBioSim/hEGFRDimer/benchmark.tpr"
          ],
          "readonly_files": [
            ""
          ],
          "scale": "1_2_node",
          "short_name": "EESSI_GROMACS_f904246a",
          "sourcepath": "",
          "sourcesdir": null,
          "stagedir": "/data/gent/vo/000/gvo00002/vsc46128/ReFrame_EESSI_CI/reframe_prefix/stage/litleo/litleo/default/EESSI_GROMACS_f904246a",
          "strict_check": true,
          "tags": [
            "chemistry",
            "sciapp",
            "1_2_node"
          ],
          "time_limit": 1800.0,
          "unique_name": "EESSI_GROMACS_0649",
          "use_multithreading": null,
          "valid_prog_environs": [
            "default"
          ],
          "valid_systems": [
            "+1_2_node +gpu %GPU_VENDOR=nvidia"
          ],
          "variables": {
            "OMP_NUM_THREADS": 24
          }
        },
        {
          "build_jobid": null,
          "build_stderr": null,
          "build_stdout": null,
          "dependencies_actual": [],
          "dependencies_conceptual": [],
          "environ": "default",
          "fail_phase": "sanity",
          "fail_reason": "sanity error: -3434760.0 is beyond reference value -3328920.0 (l=-3332248.9199999995, u=-3325591.08)",
          "filename": "/kyukon/home/gent/461/vsc46128/rfm.Txsh5gIMTo/test-suite/eessi/testsuite/tests/apps/gromacs.py",
          "fixture": false,
          "job_completion_time": "2025-02-12T18:28:05+01:00",
          "job_completion_time_unix": 1739381285.0,
          "job_stderr": "rfm_job.err",
          "job_stdout": "rfm_job.out",
          "partition": "litleo",
          "result": "fail",
          "run_index": 0,
          "scheduler": "slurm",
          "session_uuid": "51454cd1-6870-418f-b969-8432d0218c06",
          "time_compile": 0.0040400028228759766,
          "time_performance": null,
          "time_run": 1858.3001391887665,
          "time_sanity": 0.0030927658081054688,
          "time_setup": 0.014045238494873047,
          "time_total": 3625.207576274872,
          "uuid": "51454cd1-6870-418f-b969-8432d0218c06:0:49",
          "fail_info": {
            "exc_type": "SanityError",
            "exc_value": "-3434760.0 is beyond reference value -3328920.0 (l=-3332248.9199999995, u=-3325591.08)",
            "traceback": [
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/frontend/executors/__init__.py\", line 383, in _safe_call\n    return fn(*args, **kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/hooks.py\", line 109, in _fn\n    func(obj, *args, **kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/pipeline.py\", line 2149, in sanity\n    self.check_sanity()\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/pipeline.py\", line 2210, in check_sanity\n    success = sn.evaluate(self.sanity_patterns)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 929, in evaluate\n    return expr.evaluate(cache=cache)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 78, in evaluate\n    ret = ret.evaluate()\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 73, in evaluate\n    ret = self._fn(*fn_args, **fn_kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 77, in all\n    return builtins.all(iterable)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 89, in __bool__\n    return builtins.bool(self.evaluate())\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/core/deferrable.py\", line 73, in evaluate\n    ret = self._fn(*fn_args, **fn_kwargs)\n",
              "  File \"/user/gent/461/vsc46128/rfm.Txsh5gIMTo/reframe/reframe/utility/sanity.py\", line 585, in assert_reference\n    raise SanityError(_format(error_msg, val, ref, lower, upper)) from None\n"
            ]
          },
          "fail_severe": false,
          "EESSI_CONFIGS_BRANCH": "v0.5.1",
          "EESSI_CONFIGS_URL": "https://github.com/EESSI/test-suite.git",
          "job_exitcode": 0,
          "job_nodelist": [
            "node4304.litleo.os"
          ],
          "job_submit_time": 1739379427.5582094,
          "jobid": "43001297",
          "system": "litleo",
          "bench_name": "HECBioSim/hEGFRDimer",
          "benchmark_info": "HECBioSim/hEGFRDimer",
          "benchmark_version": "1.0.0",
          "build_locally": true,
          "build_time_limit": null,
          "cvmfs_repo_name": "None",
          "cvmfs_software_subdir": "['']",
          "descr": "GROMACS HECBioSim/hEGFRDimer benchmark (NB: gpu)",
          "display_name": "EESSI_GROMACS %benchmark_info=HECBioSim/hEGFRDimer %nb_impl=gpu %scale=1_2_node %module_name=GROMACS/2024.2-foss-2023b-CUDA-12.5.0",
          "eessi_testsuite_version": "0.5.1",
          "env_vars": {
            "OMP_NUM_THREADS": 24
          },
          "exact_memory": false,
          "exclusive_access": false,
          "executable": "gmx_mpi mdrun",
          "executable_opts": [
            "-nb",
            "gpu",
            "-s",
            "benchmark.tpr",
            "-dlb",
            "yes",
            "-npme",
            "-1",
            "-ntomp",
            "24"
          ],
          "extra_resources": {
            "memory": {
              "size": "157920M"
            },
            "_rfm_gpu": {
              "num_gpus_per_node": 1
            }
          },
          "full_modulepath": "['/apps/gent/RHEL9/zen4-hopper-ib/modules/all/GROMACS/2024.2-foss-2023b-CUDA-12.5.0.lua']",
          "hashcode": "2f92651e",
          "keep_files": [
            "md.log"
          ],
          "local": false,
          "maintainers": [],
          "max_pending_time": null,
          "measure_memory_usage": false,
          "module_name": "GROMACS/2024.2-foss-2023b-CUDA-12.5.0",
          "modules": [
            "GROMACS/2024.2-foss-2023b-CUDA-12.5.0"
          ],
          "name": "EESSI_GROMACS %benchmark_info=HECBioSim/hEGFRDimer %nb_impl=gpu %scale=1_2_node %module_name=GROMACS/2024.2-foss-2023b-CUDA-12.5.0",
          "nb_impl": "gpu",
          "num_cpus_per_task": 24,
          "num_gpus_per_node": 1,
          "num_tasks": 1,
          "num_tasks_per_core": null,
          "num_tasks_per_node": 1,
          "num_tasks_per_socket": null,
          "outputdir": "/data/gent/vo/000/gvo00002/vsc46128/ReFrame_EESSI_CI/reframe_prefix/output/litleo/litleo/default/EESSI_GROMACS_2f92651e",
          "perfvalues": {
            "litleo:litleo:perf": [
              null,
              0,
              null,
              null,
              "ns/day"
            ]
          },
          "postbuild_cmds": [],
          "postrun_cmds": [
            "echo \"EESSI_CVMFS_REPO: $EESSI_CVMFS_REPO\"",
            "echo \"EESSI_SOFTWARE_SUBDIR: $EESSI_SOFTWARE_SUBDIR\"",
            "echo \"FULL_MODULEPATH: $(module --location show GROMACS/2024.2-foss-2023b-CUDA-12.5.0)\""
          ],
          "prebuild_cmds": [],
          "prefix": "/kyukon/home/gent/461/vsc46128/rfm.Txsh5gIMTo/test-suite/eessi/testsuite/tests/apps",
          "prerun_cmds": [
            "curl -LJO https://github.com/victorusu/GROMACS_Benchmark_Suite/raw/1.0.0/HECBioSim/hEGFRDimer/benchmark.tpr"
          ],
          "readonly_files": [
            ""
          ],
          "scale": "1_2_node",
          "short_name": "EESSI_GROMACS_2f92651e",
          "sourcepath": "",
          "sourcesdir": null,
          "stagedir": "/data/gent/vo/000/gvo00002/vsc46128/ReFrame_EESSI_CI/reframe_prefix/stage/litleo/litleo/default/EESSI_GROMACS_2f92651e",
          "strict_check": true,
          "tags": [
            "chemistry",
            "sciapp",
            "1_2_node"
          ],
          "time_limit": 1800.0,
          "unique_name": "EESSI_GROMACS_0648",
          "use_multithreading": null,
          "valid_prog_environs": [
            "default"
          ],
          "valid_systems": [
            "+1_2_node +gpu %GPU_VENDOR=nvidia"
          ],
          "variables": {
            "OMP_NUM_THREADS": 24
          }
        },
@laraPPr laraPPr added the bug Something isn't working label Feb 14, 2025
@laraPPr
Copy link
Collaborator Author

laraPPr commented Feb 14, 2025

@casparvl Could you also test your local stack and check whether you see the same failures on your zen4/H100 cluster?

@laraPPr
Copy link
Collaborator Author

laraPPr commented Feb 26, 2025

@smoors rfm_job.out of one of the failing jobs:

  1                       :-) GROMACS - gmx mdrun, 2024.3 (-:

  2 

  3 Executable:   /apps/gent/RHEL9/zen4-hopper-ib/software/GROMACS/2024.3-foss-2023b-CUDA-12.4.0/bin/gmx_mpi

  4 Data prefix:  /apps/gent/RHEL9/zen4-hopper-ib/software/GROMACS/2024.3-foss-2023b-CUDA-12.4.0

  5 Working dir:  /kyukon/data/gent/vo/000/gvo00002/vsc46128/ReFrame_EESSI_CI/reframe_prefix/stage/litleo/litleo/default/EESSI_GROMACS_d0992d39

  6 Command line:

  7   gmx_mpi mdrun -nb gpu -s benchmark.tpr -dlb yes -npme -1 -ntomp 24

  8 

  9 Reading file benchmark.tpr, VERSION 2020.4 (single precision)

 10 Note: file tpx version 119, software tpx version 133

 11 GPU-aware MPI detected, but by default GROMACS will not make use the direct GPU communication capabilities of MPI. For improved performance try enabling the feature by setting the GMX_ENABLE_DIRECT_GPU_COMM environment variable.

 12 Changing nstlist from 10 to 100, rlist from 0.9 to 1.053

 13 

 14 

 15 Update groups can not be used for this system because there are three or more consecutively coupled constraints

 16 

 17 1 GPU selected for this run.

 18 Mapping of GPU IDs to the 2 GPU tasks in the 1 rank on this node:

 19   PP:0,PME:0

 20 PP tasks will do (non-perturbed) short-ranged interactions on the GPU

 21 PP task will update and constrain coordinates on the GPU

 22 PME tasks will do all aspects on the GPU

 23 Using 1 MPI process

 24 

 25 Non-default thread affinity set, disabling internal thread affinity

 26 

 27 Using 24 OpenMP threads

 28 

 29 starting mdrun 'Her1-Her1'

 30 10000 steps,     20.0 ps.

 31 

 32 Writing final coordinates.

 33 

 34                Core t (s)   Wall t (s)        (%)

 35        Time:     6786.948      282.822     2399.7

 36                  (ns/day)    (hour/ns)

 37 Performance:        6.110        3.928

 38 

 39 GROMACS reminds you: "Home computers are being called upon to perform many new functions, including the consumption of homework formerly eaten by the dog." (Doug Larson)

 40 

 41 EESSI_CVMFS_REPO:

 42 EESSI_SOFTWARE_SUBDIR:

 43 FULL_MODULEPATH: /apps/gent/RHEL9/zen4-hopper-ib/modules/all/GROMACS/2024.3-foss-2023b-CUDA-12.4.0.lua


@smoors
Copy link
Collaborator

smoors commented Feb 26, 2025

@laraPPr can you post the md.log files somewhere so we can check the energies?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

2 participants