Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding gpu aware mpi to osu-micro-benchmarks w/ rocm variant #581

Open
wants to merge 46 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
46 commits
Select commit Hold shift + click to select a range
7ac5171
initial commit of osu-micro-benchmarks experiment.py class
Aug 23, 2024
34fffeb
fixing workload name
Aug 23, 2024
d7b8eff
adding ability to configure workloads through spec
Aug 23, 2024
1b1887b
removed unnecessary variant
Sep 3, 2024
a4cc7ed
Merge branch 'develop' into experiments/osu-micro-benchmarks
pearce8 Sep 23, 2024
5717f06
lint
pearce8 Sep 23, 2024
d1ffb31
lint
pearce8 Sep 24, 2024
7556fee
lint
pearce8 Sep 24, 2024
8296c22
Merge branch 'develop' into experiments/osu-micro-benchmarks
pearce8 Sep 24, 2024
af08a21
Merge branch 'LLNL:develop' into experiments/osu-micro-benchmarks
august-knox Oct 28, 2024
c77da2e
fixing experiment class
Oct 30, 2024
d310741
lint
Oct 30, 2024
c009e45
lint
Oct 30, 2024
8928a5a
adding dryrun
Oct 30, 2024
3770038
fixing dry run
Oct 30, 2024
4933893
moving experiment class to new loaction
Nov 4, 2024
57f6a9d
progress on updating osu-microbenchmarks
Nov 13, 2024
4601fe8
updating experiment class
Nov 13, 2024
3f33a7e
Delete var/exp_repo/experiments/osu-micro-benchmarks directory
august-knox Nov 13, 2024
8f3ead4
Merge branch 'develop' into experiments/osu-micro-benchmarks
august-knox Nov 13, 2024
a858c6f
Merge branch 'develop' into experiments/osu-micro-benchmarks
pearce8 Nov 15, 2024
65183bc
removing tab in dryrun
Nov 15, 2024
6e04bab
removing other tabs in run.yml
Nov 15, 2024
a2b6d25
lint
Nov 15, 2024
87011e3
license
Nov 15, 2024
4c8a714
adding prefix to benchpark command
Nov 15, 2024
f3e4da9
removing redundant system instantiation
Nov 15, 2024
a437887
updating scaling formatting
Nov 25, 2024
3862b5f
updating dryruns
Dec 2, 2024
10ba803
Merge remote-tracking branch 'origin/develop' into experiments/osu-mi…
rfhaque Dec 11, 2024
e725f35
resolving merge conflicts
Dec 20, 2024
209c7ee
fixing merge conflict
Jan 15, 2025
7f3a69f
initial commit for gpu support
Jan 21, 2025
7fb9243
Merge branch 'develop' of github.com:LLNL/benchpark into osu-micro-be…
Jan 24, 2025
287a9de
Merge branch 'develop' of github.com:LLNL/benchpark into osu-micro-be…
Jan 24, 2025
c660d31
initial commit for gpu-aware mpi osu-micro-benchmarks
Jan 28, 2025
7e934bd
Merge branch 'develop' of github.com:LLNL/benchpark into osu-micro-be…
Jan 28, 2025
dd2df1d
adding dryrun
Jan 28, 2025
f1c59de
lint
Jan 28, 2025
7709c37
lint
Jan 28, 2025
88b363b
removing nused import
Jan 28, 2025
ce07785
removing extra runtime env variable
Feb 3, 2025
f178f4b
Merge branch 'develop' into osu-micro-benchmarks
august-knox Feb 3, 2025
32bd40b
Merge branch 'develop' into osu-micro-benchmarks
august-knox Feb 3, 2025
4aeeacf
changing local_rank to system specific variable
Feb 15, 2025
830bcc2
adding gtl_flag attribute
Feb 15, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 18 additions & 9 deletions .github/workflows/run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -431,15 +431,6 @@ jobs:
system_name: ruby
system_spec: llnl-cluster cluster=ruby compiler=intel

- name: osu-micro-benchmarks/mpi ruby llnl-cluster cluster=ruby compiler=intel
uses: ./.github/actions/dynamic-dry-run
with:
benchmark_name: osu-micro-benchmarks
benchmark_mode: mpi
benchmark_spec: osu-micro-benchmarks workload=all
system_name: ruby
system_spec: llnl-cluster cluster=ruby compiler=intel

- name: laghos/mpi caliper=mpi,time ruby llnl-cluster cluster=ruby compiler=intel
uses: ./.github/actions/dynamic-dry-run
with:
Expand Down Expand Up @@ -799,3 +790,21 @@ jobs:
benchmark_spec: stream caliper=mpi,time
system_name: tioga
system_spec: llnl-elcapitan rocm=5.5.1 compiler=cce

- name: osu-micro-benchmarks/mpi ruby llnl-cluster cluster=ruby compiler=intel
uses: ./.github/actions/dynamic-dry-run
with:
benchmark_name: osu-micro-benchmarks
benchmark_mode: mpi
benchmark_spec: osu-micro-benchmarks workload=all
system_name: ruby
system_spec: llnl-cluster cluster=ruby compiler=intel

- name: osu-micro-benchmarks/rocm tioga llnl-elcapitan cluster=tioga compiler=cce +gtl
uses: ./.github/actions/dynamic-dry-run
with:
benchmark_name: osu-micro-benchmarks
benchmark_mode: rocm
benchmark_spec: osu-micro-benchmarks+rocm workload=all
system_name: tioga
system_spec: llnl-elcapitan cluster=tioga compiler=cce +gtl
33 changes: 22 additions & 11 deletions experiments/osu-micro-benchmarks/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,16 @@
# SPDX-License-Identifier: Apache-2.0

from benchpark.directives import variant
from benchpark.error import BenchparkError
from benchpark.experiment import Experiment
from benchpark.rocm import ROCmExperiment
from benchpark.cuda import CudaExperiment


class OsuMicroBenchmarks(Experiment):
class OsuMicroBenchmarks(
Experiment,
ROCmExperiment,
CudaExperiment,
):

variant(
"workload",
Expand Down Expand Up @@ -90,27 +95,33 @@ class OsuMicroBenchmarks(Experiment):
)

def compute_applications_section(self):
    """Register the experiment variables for the OSU micro-benchmark runs.

    Exactly one scaling mode must be enabled on the spec; ``single_node``
    is the only mode this method knows about. For ``+single_node`` the
    node count is registered, and for GPU variants (``+rocm`` / ``+cuda``)
    the device flag is passed through ``additional_args`` and ``n_gpus``
    is registered with the same value as the node count.

    Raises:
        BenchparkError: if the number of enabled scaling modes is not
            exactly one.
    """
    scaling_modes = {
        "single_node": self.spec.satisfies("+single_node"),
    }

    enabled_modes = [mode for mode, enabled in scaling_modes.items() if enabled]
    if len(enabled_modes) != 1:
        raise BenchparkError(
            f"Only one type of scaling per experiment is allowed for application package {self.name}"
        )

    num_nodes = {"n_nodes": 2}

    if self.spec.satisfies("+single_node"):
        for pk, pv in num_nodes.items():
            self.add_experiment_variable(pk, pv, True)

    # Evaluate each GPU variant once and reuse the result below.
    uses_rocm = self.spec.satisfies("+rocm")
    uses_cuda = self.spec.satisfies("+cuda")

    if uses_rocm:
        self.add_experiment_variable("additional_args", " -d rocm", False)
    if uses_cuda:
        self.add_experiment_variable("additional_args", " -d cuda", False)
    if uses_rocm or uses_cuda:
        # n_gpus mirrors the node count. Read it directly instead of
        # looping over the dict and discarding the key, which would
        # re-register n_gpus once per entry if num_nodes ever grew.
        self.add_experiment_variable("n_gpus", num_nodes["n_nodes"], True)

def compute_spack_section(self):
    """Declare the Spack specs this experiment needs.

    Adds the default MPI spec, then the ``osu-micro-benchmarks`` package
    spec built with the default compiler. GPU-related placeholders are
    recorded in ``system_specs`` for ``+cuda`` / ``+rocm``.
    """
    gpu_specs = {}
    if self.spec.satisfies("+cuda"):
        gpu_specs = {
            "cuda_version": "{default_cuda_version}",
            "cuda_arch": "{cuda_arch}",
        }
    elif self.spec.satisfies("+rocm"):
        gpu_specs = {"rocm_arch": "{rocm_arch}"}

    # NOTE(review): the GPU entries are stored but not read by anything
    # visible in this method -- confirm whether they should feed a spec.
    system_specs = {
        **gpu_specs,
        "compiler": "default-compiler",
        "mpi": "default-mpi",
    }

    self.add_spack_spec(system_specs["mpi"])

    package_spec = ["osu-micro-benchmarks", system_specs["compiler"]]
    self.add_spack_spec(self.name, package_spec)
29 changes: 29 additions & 0 deletions repo/osu-micro-benchmarks/package.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Copyright 2023 Lawrence Livermore National Security, LLC and other
# Benchpark Project Developers. See the top-level COPYRIGHT file for details.
#
# SPDX-License-Identifier: Apache-2.0

from spack.package import *
from spack.pkg.builtin.osu_micro_benchmarks import OsuMicroBenchmarks as BuiltinOsu


class OsuMicroBenchmarks(BuiltinOsu, ROCmPackage):
    """Benchpark overlay of Spack's builtin ``osu-micro-benchmarks`` package.

    Extends the builtin recipe with a ROCm build that links against
    ``cray-mpich+gtl`` and exports the run-time environment the OSU
    binaries need.
    """

    depends_on("cray-mpich+gtl", when="+rocm")

    def configure_args(self):
        """Return the builtin configure arguments, plus MPI LDFLAGS for ROCm.

        For ``+rocm`` builds the linker flags of the MPI dependency are
        appended as an ``LDFLAGS=...`` configure argument.
        """
        args = super().configure_args()
        if self.spec.satisfies("+rocm"):
            args.append(f"LDFLAGS={self.spec['mpi'].libs.ld_flags}")
        return args

    def setup_run_environment(self, env):
        """Put every OSU benchmark directory on PATH and set LOCAL_RANK.

        Args:
            env: the environment-modification object Spack passes in; it
                is updated in place.
        """
        mpidir = join_path(self.prefix.libexec, "osu-micro-benchmarks", "mpi")
        # Same order as before: later prepends end up earlier on PATH.
        for subdir in ("startup", "pt2pt", "one-sided", "collective"):
            env.prepend_path("PATH", join_path(mpidir, subdir))

        if self.spec.satisfies("+rocm") and self.spec.satisfies("^cray-mpich+gtl"):
            # LOCAL_RANK is a scalar, not a path list: set it outright so a
            # pre-existing value is replaced rather than joined with a
            # path separator.
            env.set("LOCAL_RANK", self.spec["mpi"].extra_attributes["gtl_flags"])
1 change: 1 addition & 0 deletions systems/llnl-elcapitan/system.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,6 +160,7 @@ def mpi_config(self, cce_version):
"""

use_gtl = f"""\
gtl_flags: $MV2_COMM_WORLD_LOCAL_RANK
gtl_cutoff_size: 4096
fi_cxi_ats: 0
gtl_lib_path: /opt/cray/pe/mpich/{mpi_version}/gtl/lib
Expand Down