diff --git a/ci/build_docs.sh b/ci/build_docs.sh index 9b72c98f1bb..089558bd452 100755 --- a/ci/build_docs.sh +++ b/ci/build_docs.sh @@ -20,24 +20,33 @@ rapids-logger "Downloading artifacts from previous jobs" CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp) PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python) +if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then + CONDA_CUDA_VERSION="11.8" + DGL_CHANNEL="dglteam/label/cu118" +else + CONDA_CUDA_VERSION="12.1" + DGL_CHANNEL="dglteam/label/cu121" +fi + rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ + --channel conda-forge \ + --channel pyg \ + --channel nvidia \ + --channel "${DGL_CHANNEL}" \ libcugraph \ pylibcugraph \ cugraph \ cugraph-pyg \ + cugraph-dgl \ cugraph-service-server \ cugraph-service-client \ libcugraph_etl \ pylibcugraphops \ - pylibwholegraph - -# This command installs `cugraph-dgl` without its dependencies -# since this package can currently only run in `11.6` CTK environments -# due to the dependency version specifications in its conda recipe. -rapids-logger "Install cugraph-dgl" -rapids-mamba-retry install "${PYTHON_CHANNEL}/linux-64/cugraph-dgl-*.tar.bz2" + pylibwholegraph \ + pytorch \ + "cuda-version=${CONDA_CUDA_VERSION}" export RAPIDS_VERSION="$(rapids-version)" export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)" diff --git a/ci/run_cugraph_pyg_pytests.sh b/ci/run_cugraph_pyg_pytests.sh index 47ed6ba0008..0acc8aa462a 100755 --- a/ci/run_cugraph_pyg_pytests.sh +++ b/ci/run_cugraph_pyg_pytests.sh @@ -7,3 +7,9 @@ set -euo pipefail cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_pyg pytest --cache-clear --ignore=tests/mg "$@" . + +# Test examples +for e in "$(pwd)"/examples/*.py; do + rapids-logger "running example $e" + python $e +done diff --git a/ci/test_python.sh b/ci/test_python.sh index a9198306c2f..dfcf6816dec 100755 --- a/ci/test_python.sh +++ b/ci/test_python.sh @@ -154,8 +154,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - --channel pytorch \ - --channel pytorch-nightly \ + --channel conda-forge \ --channel dglteam/label/cu118 \ --channel nvidia \ libcugraph \ @@ -165,7 +164,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then cugraph-dgl \ 'dgl>=1.1.0.cu*,<=2.0.0.cu*' \ 'pytorch>=2.0' \ - 'pytorch-cuda>=11.8' + 'cuda-version=11.8' rapids-print-env @@ -191,33 +190,43 @@ fi if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then - rapids-mamba-retry env create --yes -f env.yaml -n test_cugraph_pyg + rapids-mamba-retry env create --force -f env.yaml -n test_cugraph_pyg # Temporarily allow unbound variables for conda activation. set +u conda activate test_cugraph_pyg set -u + # TODO re-enable logic once CUDA 12 is testable + #if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then + CONDA_CUDA_VERSION="11.8" + PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html" + #else + # CONDA_CUDA_VERSION="12.1" + # PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu121.html" + #fi + # Will automatically install built dependencies of cuGraph-PyG rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ --channel pytorch \ - --channel nvidia \ --channel pyg \ - --channel rapidsai-nightly \ + --channel nvidia \ "cugraph-pyg" \ - "pytorch>=2.0,<2.1" \ - "pytorch-cuda=11.8" + "pytorch=2.1.0" \ + "pytorch-cuda=${CONDA_CUDA_VERSION}" # Install pyg dependencies (which requires pip) + + pip install ogb pip install \ pyg_lib \ torch_scatter \ torch_sparse \ torch_cluster \ torch_spline_conv \ - -f https://data.pyg.org/whl/torch-2.0.0+cu118.html + -f ${PYG_URL} rapids-print-env @@ -235,12 +244,11 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then conda deactivate conda activate test set -u - else rapids-logger "skipping cugraph_pyg pytest on ARM64" fi else - rapids-logger "skipping cugraph_pyg pytest on CUDA != 11.8" + rapids-logger "skipping cugraph_pyg pytest on CUDA!=11.8" fi # test cugraph-equivariant @@ -253,7 +261,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then rapids-mamba-retry install \ --channel "${CPP_CHANNEL}" \ --channel "${PYTHON_CHANNEL}" \ - --channel pytorch \ + --channel conda-forge \ --channel nvidia \ cugraph-equivariant pip install e3nn==0.5.1 diff --git a/ci/test_wheel_cugraph-pyg.sh b/ci/test_wheel_cugraph-pyg.sh index 50cbfb3e1fe..ad615f0b3ff 100755 --- a/ci/test_wheel_cugraph-pyg.sh +++ b/ci/test_wheel_cugraph-pyg.sh @@ -25,25 +25,33 @@ python -m pip install $(ls ./dist/${python_package_name}*.whl)[test] export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)" if [[ "${CUDA_VERSION}" == "11.8.0" ]]; then - rapids-logger "Installing PyTorch and PyG dependencies" PYTORCH_URL="https://download.pytorch.org/whl/cu118" - rapids-retry python -m pip install torch==2.1.0 --index-url ${PYTORCH_URL} - rapids-retry python -m pip install torch-geometric==2.4.0 - rapids-retry python -m pip install \ - pyg_lib \ - torch_scatter \ - torch_sparse \ - torch_cluster \ - torch_spline_conv \ - -f https://data.pyg.org/whl/torch-2.1.0+cu118.html - - rapids-logger "pytest cugraph-pyg (single GPU)" - pushd python/cugraph-pyg/cugraph_pyg - python -m pytest \ - --cache-clear \ - --ignore=tests/mg \ - tests - popd + PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html" else - rapids-logger "skipping cugraph-pyg wheel test on CUDA!=11.8" + PYTORCH_URL="https://download.pytorch.org/whl/cu121" + PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu121.html" fi +rapids-logger "Installing PyTorch and PyG dependencies" +rapids-retry python -m pip install torch==2.1.0 --index-url ${PYTORCH_URL} +rapids-retry python -m pip install torch-geometric==2.4.0 +rapids-retry python -m pip install \ + ogb \ + pyg_lib \ + torch_scatter \ + torch_sparse \ + torch_cluster \ + torch_spline_conv \ + -f ${PYG_URL} + +rapids-logger "pytest cugraph-pyg (single GPU)" +pushd python/cugraph-pyg/cugraph_pyg +python -m pytest \ + --cache-clear \ + --ignore=tests/mg \ + tests +# Test examples +for e in "$(pwd)"/examples/*.py; do + rapids-logger "running example $e" + python $e +done +popd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index f656df9e32b..f3aeedad641 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- pytorch - pyg - dglteam/label/cu118 - conda-forge diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml index 2bd0746e49c..4d64ce33a4e 100644 --- a/conda/environments/all_cuda-122_arch-x86_64.yaml +++ b/conda/environments/all_cuda-122_arch-x86_64.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- pytorch - pyg - dglteam/label/cu118 - conda-forge diff --git a/dependencies.yaml b/dependencies.yaml index 85fb1344ce9..3fa19eb238a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -285,7 +285,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev - - pytorch - pyg - dglteam/label/cu118 - conda-forge diff --git a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml index feafe57246c..bc8bf776a1e 100644 --- a/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml +++ b/python/cugraph-dgl/conda/cugraph_dgl_dev_cuda-118.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- pytorch - pyg - dglteam/label/cu118 - conda-forge diff --git a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml index 81c53044a1a..94e9f1decbd 100644 --- a/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml +++ b/python/cugraph-pyg/conda/cugraph_pyg_dev_cuda-118.yaml @@ -4,7 +4,6 @@ channels: - rapidsai - rapidsai-nightly - dask/label/dev -- pytorch - pyg - dglteam/label/cu118 - conda-forge diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py index 4ca573504a1..80d683e6c79 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_mg.py @@ -17,6 +17,7 @@ import time import argparse import gc +import warnings import torch import numpy as np @@ -405,7 +406,8 @@ def parse_args(): "--dask_scheduler_file", type=str, help="The path to the dask scheduler file", - required=True, + required=False, + default=None, ) return parser.parse_args() @@ -413,19 +415,24 @@ def parse_args(): def main(): args = parse_args() + if args.dask_scheduler_file is None: + warnings.warn( + "You must provide the dask scheduler file " "to run this example. Exiting." + ) - torch_devices = [int(d) for d in args.torch_devices.split(",")] - - train_args = ( - torch_devices, - args.torch_manager_ip, - args.torch_manager_port, - args.dask_scheduler_file, - args.num_epochs, - args.features_on_gpu, - ) + else: + torch_devices = [int(d) for d in args.torch_devices.split(",")] + + train_args = ( + torch_devices, + args.torch_manager_ip, + args.torch_manager_port, + args.dask_scheduler_file, + args.num_epochs, + args.features_on_gpu, + ) - tmp.spawn(train, args=train_args, nprocs=len(torch_devices)) + tmp.spawn(train, args=train_args, nprocs=len(torch_devices)) if __name__ == "__main__": diff --git a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py index 9c96a707e4d..58a403084df 100644 --- a/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py +++ b/python/cugraph-pyg/cugraph_pyg/examples/graph_sage_sg.py @@ -97,10 +97,13 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) - num_papers = data[0]["num_nodes_dict"]["paper"] train_perc = 0.1 + train_nodes = torch.randperm(num_papers) train_nodes = train_nodes[: int(train_perc * num_papers)] + train_mask = torch.full((num_papers,), -1, device=device) train_mask[train_nodes] = 1 + fs.add_data(train_mask, "paper", "train") cugraph_store = CuGraphStore(fs, G, N) @@ -128,47 +131,46 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) - # barrier() cannot do this since the number of ops per rank is # different. It essentially acts like barrier would if the # number of ops per rank was the same. - for epoch in range(num_epochs): - for iter_i, hetero_data in enumerate(cugraph_bulk_loader): - num_batches += 1 - if iter_i % 20 == 0: - print(f"iteration {iter_i}") - - # train - train_mask = hetero_data.train_dict["paper"] - y_true = hetero_data.y_dict["paper"] - - y_pred = model( - hetero_data.x_dict["paper"].to(device).to(torch.float32), - hetero_data.edge_index_dict[("paper", "cites", "paper")].to(device), - (len(y_true), len(y_true)), - ) - - y_true = F.one_hot( - y_true[train_mask].to(torch.int64), num_classes=349 - ).to(torch.float32) - - y_pred = y_pred[train_mask] - - loss = F.cross_entropy(y_pred, y_true) - - optimizer.zero_grad() - loss.backward() - optimizer.step() - total_loss += loss.item() - - del y_true - del y_pred - del loss - del hetero_data - gc.collect() - - end_time_train = time.perf_counter_ns() - print( - f"epoch {epoch} time: " - f"{(end_time_train - start_time_train) / 1e9:3.4f} s" + for iter_i, hetero_data in enumerate(cugraph_bulk_loader): + num_batches += 1 + if iter_i % 20 == 0: + print(f"iteration {iter_i}") + + # train + train_mask = hetero_data.train_dict["paper"] + y_true = hetero_data.y_dict["paper"] + + y_pred = model( + hetero_data.x_dict["paper"].to(device).to(torch.float32), + hetero_data.edge_index_dict[("paper", "cites", "paper")].to(device), + (len(y_true), len(y_true)), ) - print(f"loss after epoch {epoch}: {total_loss / num_batches}") + + y_true = F.one_hot(y_true[train_mask].to(torch.int64), num_classes=349).to( + torch.float32 + ) + + y_pred = y_pred[train_mask] + + loss = F.cross_entropy(y_pred, y_true) + + optimizer.zero_grad() + loss.backward() + optimizer.step() + total_loss += loss.item() + + del y_true + del y_pred + del loss + del hetero_data + gc.collect() + + end_time_train = time.perf_counter_ns() + print( + f"epoch {epoch} time: " + f"{(end_time_train - start_time_train) / 1e9:3.4f} s" + ) + print(f"loss after epoch {epoch}: {total_loss / num_batches}") def parse_args():