Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test GNN Examples and Add CUDA 12 Testing #4317

Merged
merged 27 commits into from
Apr 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
cc19d71
run on cuda 12, run examples for pyg
alexbarghi-nv Apr 4, 2024
4b0181a
fix merge conflict
alexbarghi-nv Apr 4, 2024
77353f9
fix shell script
alexbarghi-nv Apr 5, 2024
53dbfe8
switch dgl pytorch to conda-forge version
alexbarghi-nv Apr 5, 2024
e83f848
test only python files
alexbarghi-nv Apr 9, 2024
e496816
Merge branch 'branch-24.06' into test-gnn-examples
alexbarghi-nv Apr 9, 2024
e67d637
update pr
alexbarghi-nv Apr 9, 2024
531db1a
add pytorch to docs script
alexbarghi-nv Apr 9, 2024
697ff96
Merge branch 'branch-24.06' into test-gnn-examples
alexbarghi-nv Apr 9, 2024
cf2ce8f
remove separate dgl install
alexbarghi-nv Apr 9, 2024
3a0595e
Merge branch 'test-gnn-examples' of https://github.com/alexbarghi-nv/…
alexbarghi-nv Apr 9, 2024
169b85b
dgl
alexbarghi-nv Apr 9, 2024
ecf3ea4
revert nx change
alexbarghi-nv Apr 9, 2024
cad86ee
use pytorch channel instead of conda-forge
alexbarghi-nv Apr 10, 2024
6d35be0
switch back to conda-forge
alexbarghi-nv Apr 10, 2024
ff8a6c4
Merge branch 'branch-24.06' into test-gnn-examples
alexbarghi-nv Apr 10, 2024
966b068
update dependencies, test
alexbarghi-nv Apr 10, 2024
7a1f754
Merge branch 'test-gnn-examples' of https://github.com/alexbarghi-nv/…
alexbarghi-nv Apr 10, 2024
22ebdd0
style
alexbarghi-nv Apr 10, 2024
7690b66
revert nx change
alexbarghi-nv Apr 10, 2024
2eac8eb
graph sage mg
alexbarghi-nv Apr 11, 2024
0ad4e0d
fix graph sage sg bug
alexbarghi-nv Apr 11, 2024
2d22fdf
reformat
alexbarghi-nv Apr 11, 2024
a0a8a50
switch back to pytorch-cuda for PyG
alexbarghi-nv Apr 11, 2024
5a96d83
disable cuda 12
alexbarghi-nv Apr 11, 2024
4b4be90
fix syntax error in python script
alexbarghi-nv Apr 11, 2024
5d58d52
install from pyg url
alexbarghi-nv Apr 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 16 additions & 7 deletions ci/build_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,24 +20,33 @@ rapids-logger "Downloading artifacts from previous jobs"
CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)

if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
CONDA_CUDA_VERSION="11.8"
DGL_CHANNEL="dglteam/label/cu118"
else
CONDA_CUDA_VERSION="12.1"
DGL_CHANNEL="dglteam/label/cu121"
fi

rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
--channel "${PYTHON_CHANNEL}" \
--channel conda-forge \
--channel pyg \
--channel nvidia \
--channel "${DGL_CHANNEL}" \
libcugraph \
pylibcugraph \
cugraph \
cugraph-pyg \
cugraph-dgl \
cugraph-service-server \
cugraph-service-client \
libcugraph_etl \
pylibcugraphops \
pylibwholegraph

# This command installs `cugraph-dgl` without its dependencies
# since this package can currently only run in `11.6` CTK environments
# due to the dependency version specifications in its conda recipe.
rapids-logger "Install cugraph-dgl"
rapids-mamba-retry install "${PYTHON_CHANNEL}/linux-64/cugraph-dgl-*.tar.bz2"
pylibwholegraph \
pytorch \
"cuda-version=${CONDA_CUDA_VERSION}"

export RAPIDS_VERSION="$(rapids-version)"
export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
Expand Down
6 changes: 6 additions & 0 deletions ci/run_cugraph_pyg_pytests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,9 @@ set -euo pipefail
cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_pyg

pytest --cache-clear --ignore=tests/mg "$@" .

# Test examples: run each Python script under examples/ in the current directory.
for e in "$(pwd)"/examples/*.py; do
  rapids-logger "running example $e"
  # Quote the path so a directory name containing whitespace or glob
  # characters reaches python as a single argument instead of being split.
  python "$e"
done
32 changes: 20 additions & 12 deletions ci/test_python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -154,8 +154,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
--channel "${PYTHON_CHANNEL}" \
--channel pytorch \
--channel pytorch-nightly \
--channel conda-forge \
--channel dglteam/label/cu118 \
--channel nvidia \
libcugraph \
Expand All @@ -165,7 +164,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
cugraph-dgl \
'dgl>=1.1.0.cu*,<=2.0.0.cu*' \
'pytorch>=2.0' \
'pytorch-cuda>=11.8'
'cuda-version=11.8'

rapids-print-env

Expand All @@ -191,33 +190,43 @@ fi

if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then
rapids-mamba-retry env create --yes -f env.yaml -n test_cugraph_pyg
rapids-mamba-retry env create --force -f env.yaml -n test_cugraph_pyg

# Temporarily allow unbound variables for conda activation.
set +u
conda activate test_cugraph_pyg
set -u

# TODO re-enable logic once CUDA 12 is testable
#if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
CONDA_CUDA_VERSION="11.8"
PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html"
#else
# CONDA_CUDA_VERSION="12.1"
# PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu121.html"
#fi

# Will automatically install built dependencies of cuGraph-PyG
rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
--channel "${PYTHON_CHANNEL}" \
--channel pytorch \
--channel nvidia \
--channel pyg \
--channel rapidsai-nightly \
--channel nvidia \
"cugraph-pyg" \
"pytorch>=2.0,<2.1" \
"pytorch-cuda=11.8"
"pytorch=2.1.0" \
"pytorch-cuda=${CONDA_CUDA_VERSION}"

# Install pyg dependencies (which requires pip)

pip install ogb
pip install \
pyg_lib \
torch_scatter \
torch_sparse \
torch_cluster \
torch_spline_conv \
-f https://data.pyg.org/whl/torch-2.0.0+cu118.html
-f ${PYG_URL}

rapids-print-env

Expand All @@ -235,12 +244,11 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
conda deactivate
conda activate test
set -u

else
rapids-logger "skipping cugraph_pyg pytest on ARM64"
fi
else
rapids-logger "skipping cugraph_pyg pytest on CUDA != 11.8"
rapids-logger "skipping cugraph_pyg pytest on CUDA!=11.8"
fi

# test cugraph-equivariant
Expand All @@ -253,7 +261,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
rapids-mamba-retry install \
--channel "${CPP_CHANNEL}" \
--channel "${PYTHON_CHANNEL}" \
--channel pytorch \
--channel conda-forge \
--channel nvidia \
cugraph-equivariant
pip install e3nn==0.5.1
Expand Down
46 changes: 27 additions & 19 deletions ci/test_wheel_cugraph-pyg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,25 +25,33 @@ python -m pip install $(ls ./dist/${python_package_name}*.whl)[test]
export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"

if [[ "${CUDA_VERSION}" == "11.8.0" ]]; then
rapids-logger "Installing PyTorch and PyG dependencies"
PYTORCH_URL="https://download.pytorch.org/whl/cu118"
rapids-retry python -m pip install torch==2.1.0 --index-url ${PYTORCH_URL}
rapids-retry python -m pip install torch-geometric==2.4.0
rapids-retry python -m pip install \
pyg_lib \
torch_scatter \
torch_sparse \
torch_cluster \
torch_spline_conv \
-f https://data.pyg.org/whl/torch-2.1.0+cu118.html

rapids-logger "pytest cugraph-pyg (single GPU)"
pushd python/cugraph-pyg/cugraph_pyg
python -m pytest \
--cache-clear \
--ignore=tests/mg \
tests
popd
PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html"
else
rapids-logger "skipping cugraph-pyg wheel test on CUDA!=11.8"
PYTORCH_URL="https://download.pytorch.org/whl/cu121"
PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu121.html"
fi
rapids-logger "Installing PyTorch and PyG dependencies"
rapids-retry python -m pip install torch==2.1.0 --index-url ${PYTORCH_URL}
rapids-retry python -m pip install torch-geometric==2.4.0
rapids-retry python -m pip install \
ogb \
pyg_lib \
torch_scatter \
torch_sparse \
torch_cluster \
torch_spline_conv \
-f ${PYG_URL}

rapids-logger "pytest cugraph-pyg (single GPU)"
pushd python/cugraph-pyg/cugraph_pyg
python -m pytest \
--cache-clear \
--ignore=tests/mg \
tests
# Test examples: run each Python script under examples/ in the current directory.
for e in "$(pwd)"/examples/*.py; do
  rapids-logger "running example $e"
  # Quote the path so a directory name containing whitespace or glob
  # characters reaches python as a single argument instead of being split.
  python "$e"
done
popd
1 change: 0 additions & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ channels:
- rapidsai
- rapidsai-nightly
- dask/label/dev
- pytorch
- pyg
- dglteam/label/cu118
- conda-forge
Expand Down
1 change: 0 additions & 1 deletion conda/environments/all_cuda-122_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ channels:
- rapidsai
- rapidsai-nightly
- dask/label/dev
- pytorch
- pyg
- dglteam/label/cu118
- conda-forge
Expand Down
1 change: 0 additions & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,6 @@ channels:
- rapidsai
- rapidsai-nightly
- dask/label/dev
- pytorch
- pyg
- dglteam/label/cu118
- conda-forge
Expand Down
1 change: 0 additions & 1 deletion python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ channels:
- rapidsai
- rapidsai-nightly
- dask/label/dev
- pytorch
- pyg
- dglteam/label/cu118
- conda-forge
Expand Down
1 change: 0 additions & 1 deletion python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ channels:
- rapidsai
- rapidsai-nightly
- dask/label/dev
- pytorch
- pyg
- dglteam/label/cu118
- conda-forge
Expand Down
31 changes: 19 additions & 12 deletions python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import time
import argparse
import gc
import warnings

import torch
import numpy as np
Expand Down Expand Up @@ -405,27 +406,33 @@ def parse_args():
"--dask_scheduler_file",
type=str,
help="The path to the dask scheduler file",
required=True,
required=False,
default=None,
)

return parser.parse_args()


def main():
args = parse_args()
if args.dask_scheduler_file is None:
warnings.warn(
"You must provide the dask scheduler file " "to run this example. Exiting."
)

torch_devices = [int(d) for d in args.torch_devices.split(",")]

train_args = (
torch_devices,
args.torch_manager_ip,
args.torch_manager_port,
args.dask_scheduler_file,
args.num_epochs,
args.features_on_gpu,
)
else:
torch_devices = [int(d) for d in args.torch_devices.split(",")]

train_args = (
torch_devices,
args.torch_manager_ip,
args.torch_manager_port,
args.dask_scheduler_file,
args.num_epochs,
args.features_on_gpu,
)

tmp.spawn(train, args=train_args, nprocs=len(torch_devices))
tmp.spawn(train, args=train_args, nprocs=len(torch_devices))


if __name__ == "__main__":
Expand Down
82 changes: 42 additions & 40 deletions python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,10 +97,13 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) -

num_papers = data[0]["num_nodes_dict"]["paper"]
train_perc = 0.1

train_nodes = torch.randperm(num_papers)
train_nodes = train_nodes[: int(train_perc * num_papers)]

train_mask = torch.full((num_papers,), -1, device=device)
train_mask[train_nodes] = 1

fs.add_data(train_mask, "paper", "train")

cugraph_store = CuGraphStore(fs, G, N)
Expand Down Expand Up @@ -128,47 +131,46 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) -
# barrier() cannot do this since the number of ops per rank is
# different. It essentially acts like barrier would if the
# number of ops per rank was the same.
for epoch in range(num_epochs):
for iter_i, hetero_data in enumerate(cugraph_bulk_loader):
num_batches += 1
if iter_i % 20 == 0:
print(f"iteration {iter_i}")

# train
train_mask = hetero_data.train_dict["paper"]
y_true = hetero_data.y_dict["paper"]

y_pred = model(
hetero_data.x_dict["paper"].to(device).to(torch.float32),
hetero_data.edge_index_dict[("paper", "cites", "paper")].to(device),
(len(y_true), len(y_true)),
)

y_true = F.one_hot(
y_true[train_mask].to(torch.int64), num_classes=349
).to(torch.float32)

y_pred = y_pred[train_mask]

loss = F.cross_entropy(y_pred, y_true)

optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()

del y_true
del y_pred
del loss
del hetero_data
gc.collect()

end_time_train = time.perf_counter_ns()
print(
f"epoch {epoch} time: "
f"{(end_time_train - start_time_train) / 1e9:3.4f} s"
for iter_i, hetero_data in enumerate(cugraph_bulk_loader):
num_batches += 1
if iter_i % 20 == 0:
print(f"iteration {iter_i}")

# train
train_mask = hetero_data.train_dict["paper"]
y_true = hetero_data.y_dict["paper"]

y_pred = model(
hetero_data.x_dict["paper"].to(device).to(torch.float32),
hetero_data.edge_index_dict[("paper", "cites", "paper")].to(device),
(len(y_true), len(y_true)),
)
print(f"loss after epoch {epoch}: {total_loss / num_batches}")

y_true = F.one_hot(y_true[train_mask].to(torch.int64), num_classes=349).to(
torch.float32
)

y_pred = y_pred[train_mask]

loss = F.cross_entropy(y_pred, y_true)

optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()

del y_true
del y_pred
del loss
del hetero_data
gc.collect()

end_time_train = time.perf_counter_ns()
print(
f"epoch {epoch} time: "
f"{(end_time_train - start_time_train) / 1e9:3.4f} s"
)
print(f"loss after epoch {epoch}: {total_loss / num_batches}")


def parse_args():
Expand Down
Loading