rapidsai · rapids-bot · Apr 12, 2024 · Apr 4, 2024 · Apr 4, 2024 · Apr 5, 2024
@@ -20,24 +20,33 @@ rapids-logger "Downloading artifacts from previous jobs"
 CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 
+if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
+  CONDA_CUDA_VERSION="11.8"
+  DGL_CHANNEL="dglteam/label/cu118"
+else
+  CONDA_CUDA_VERSION="12.1"
+  DGL_CHANNEL="dglteam/label/cu121"
+fi
+
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
+  --channel conda-forge \
+  --channel pyg \
+  --channel nvidia \
+  --channel "${DGL_CHANNEL}" \
   libcugraph \
   pylibcugraph \
   cugraph \
   cugraph-pyg \
+  cugraph-dgl \
   cugraph-service-server \
   cugraph-service-client \
   libcugraph_etl \
   pylibcugraphops \
-  pylibwholegraph
-
-# This command installs `cugraph-dgl` without its dependencies
-# since this package can currently only run in `11.6` CTK environments
-# due to the dependency version specifications in its conda recipe.
-rapids-logger "Install cugraph-dgl"
-rapids-mamba-retry install "${PYTHON_CHANNEL}/linux-64/cugraph-dgl-*.tar.bz2"
+  pylibwholegraph \
+  pytorch \
+  "cuda-version=${CONDA_CUDA_VERSION}"
 
 export RAPIDS_VERSION="$(rapids-version)"
 export RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"

@@ -7,3 +7,9 @@ set -euo pipefail
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cugraph-pyg/cugraph_pyg
 
 pytest --cache-clear --ignore=tests/mg "$@" .
+
+# Smoke-test the examples; the path is quoted so directories with spaces work
+for e in "$(pwd)"/examples/*.py; do
+  rapids-logger "running example $e"
+  python "$e"
+done
@@ -154,8 +154,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
     rapids-mamba-retry install \
       --channel "${CPP_CHANNEL}" \
       --channel "${PYTHON_CHANNEL}" \
-      --channel pytorch \
-      --channel pytorch-nightly \
+      --channel conda-forge \
       --channel dglteam/label/cu118 \
       --channel nvidia \
       libcugraph \
@@ -165,7 +164,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
       cugraph-dgl \
       'dgl>=1.1.0.cu*,<=2.0.0.cu*' \
       'pytorch>=2.0' \
-      'pytorch-cuda>=11.8'
+      'cuda-version=11.8'
 
     rapids-print-env
 
@@ -191,33 +190,43 @@ fi
 
 if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
   if [[ "${RUNNER_ARCH}" != "ARM64" ]]; then
-    rapids-mamba-retry env create --yes -f env.yaml -n test_cugraph_pyg
+    rapids-mamba-retry env create --force -f env.yaml -n test_cugraph_pyg
 
     # Temporarily allow unbound variables for conda activation.
     set +u
     conda activate test_cugraph_pyg
     set -u
 
+    # TODO re-enable logic once CUDA 12 is testable
+    #if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
+    CONDA_CUDA_VERSION="11.8"
+    PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html"
+    #else
+    #  CONDA_CUDA_VERSION="12.1"
+    #  PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu121.html"
+    #fi
+
     # Will automatically install built dependencies of cuGraph-PyG
     rapids-mamba-retry install \
       --channel "${CPP_CHANNEL}" \
       --channel "${PYTHON_CHANNEL}" \
       --channel pytorch \
-      --channel nvidia \
       --channel pyg \
-      --channel rapidsai-nightly \
+      --channel nvidia \
       "cugraph-pyg" \
-      "pytorch>=2.0,<2.1" \
-      "pytorch-cuda=11.8"
+      "pytorch=2.1.0" \
+      "pytorch-cuda=${CONDA_CUDA_VERSION}"
 
     # Install pyg dependencies (which requires pip)
+
+    pip install ogb
     pip install \
         pyg_lib \
         torch_scatter \
         torch_sparse \
         torch_cluster \
         torch_spline_conv \
-      -f https://data.pyg.org/whl/torch-2.0.0+cu118.html
+      -f "${PYG_URL}"
 
     rapids-print-env
 
@@ -235,12 +244,11 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
     conda deactivate
     conda activate test
     set -u
-
   else
     rapids-logger "skipping cugraph_pyg pytest on ARM64"
   fi
 else
-  rapids-logger "skipping cugraph_pyg pytest on CUDA != 11.8"
+  rapids-logger "skipping cugraph_pyg pytest on CUDA!=11.8"
 fi
 
 # test cugraph-equivariant
@@ -253,7 +261,7 @@ if [[ "${RAPIDS_CUDA_VERSION}" == "11.8.0" ]]; then
     rapids-mamba-retry install \
       --channel "${CPP_CHANNEL}" \
       --channel "${PYTHON_CHANNEL}" \
-      --channel pytorch \
+      --channel conda-forge \
       --channel nvidia \
       cugraph-equivariant
     pip install e3nn==0.5.1

@@ -25,25 +25,33 @@ python -m pip install $(ls ./dist/${python_package_name}*.whl)[test]
 export RAPIDS_DATASET_ROOT_DIR="$(realpath datasets)"
 
 if [[ "${CUDA_VERSION}" == "11.8.0" ]]; then
-  rapids-logger "Installing PyTorch and PyG dependencies"
   PYTORCH_URL="https://download.pytorch.org/whl/cu118"
-  rapids-retry python -m pip install torch==2.1.0 --index-url ${PYTORCH_URL}
-  rapids-retry python -m pip install torch-geometric==2.4.0
-  rapids-retry python -m pip install \
-    pyg_lib \
-    torch_scatter \
-    torch_sparse \
-    torch_cluster \
-    torch_spline_conv \
-    -f https://data.pyg.org/whl/torch-2.1.0+cu118.html
-
-  rapids-logger "pytest cugraph-pyg (single GPU)"
-  pushd python/cugraph-pyg/cugraph_pyg
-  python -m pytest \
-    --cache-clear \
-    --ignore=tests/mg \
-    tests
-  popd
+  PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu118.html"
 else
-  rapids-logger "skipping cugraph-pyg wheel test on CUDA!=11.8"
+  PYTORCH_URL="https://download.pytorch.org/whl/cu121"
+  PYG_URL="https://data.pyg.org/whl/torch-2.1.0+cu121.html"
 fi
+rapids-logger "Installing PyTorch and PyG dependencies"
+rapids-retry python -m pip install torch==2.1.0 --index-url "${PYTORCH_URL}"
+rapids-retry python -m pip install torch-geometric==2.4.0
+rapids-retry python -m pip install \
+  ogb \
+  pyg_lib \
+  torch_scatter \
+  torch_sparse \
+  torch_cluster \
+  torch_spline_conv \
+  -f "${PYG_URL}"
+
+rapids-logger "pytest cugraph-pyg (single GPU)"
+pushd python/cugraph-pyg/cugraph_pyg
+python -m pytest \
+  --cache-clear \
+  --ignore=tests/mg \
+  tests
+# Run every example script; the path is quoted to avoid word splitting
+for e in "$(pwd)"/examples/*.py; do
+  rapids-logger "running example $e"
+  python "$e"
+done
+popd
@@ -4,7 +4,6 @@ channels:
 - rapidsai
 - rapidsai-nightly
 - dask/label/dev
-- pytorch
 - pyg
 - dglteam/label/cu118
 - conda-forge

@@ -4,7 +4,6 @@ channels:
 - rapidsai
 - rapidsai-nightly
 - dask/label/dev
-- pytorch
 - pyg
 - dglteam/label/cu118
 - conda-forge

diff --git a/dependencies.yaml b/dependencies.yaml
@@ -285,7 +285,6 @@ channels:
   - rapidsai
   - rapidsai-nightly
   - dask/label/dev
-  - pytorch
   - pyg
   - dglteam/label/cu118
   - conda-forge

@@ -4,7 +4,6 @@ channels:
 - rapidsai
 - rapidsai-nightly
 - dask/label/dev
-- pytorch
 - pyg
 - dglteam/label/cu118
 - conda-forge

@@ -4,7 +4,6 @@ channels:
 - rapidsai
 - rapidsai-nightly
 - dask/label/dev
-- pytorch
 - pyg
 - dglteam/label/cu118
 - conda-forge

@@ -17,6 +17,7 @@
 import time
 import argparse
 import gc
+import warnings
 
 import torch
 import numpy as np
@@ -405,27 +406,33 @@ def parse_args():
         "--dask_scheduler_file",
         type=str,
         help="The path to the dask scheduler file",
-        required=True,
+        required=False,
+        default=None,
     )
 
     return parser.parse_args()
 
 
 def main():
     args = parse_args()
+    if args.dask_scheduler_file is None:
+        warnings.warn(
+            "You must provide the dask scheduler file to run this example.  Exiting."
+        )
 
-    torch_devices = [int(d) for d in args.torch_devices.split(",")]
-
-    train_args = (
-        torch_devices,
-        args.torch_manager_ip,
-        args.torch_manager_port,
-        args.dask_scheduler_file,
-        args.num_epochs,
-        args.features_on_gpu,
-    )
+    else:
+        torch_devices = [int(d) for d in args.torch_devices.split(",")]
+
+        train_args = (
+            torch_devices,
+            args.torch_manager_ip,
+            args.torch_manager_port,
+            args.dask_scheduler_file,
+            args.num_epochs,
+            args.features_on_gpu,
+        )
 
-    tmp.spawn(train, args=train_args, nprocs=len(torch_devices))
+        tmp.spawn(train, args=train_args, nprocs=len(torch_devices))
 
 
 if __name__ == "__main__":

@@ -97,10 +97,13 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) -
 
     num_papers = data[0]["num_nodes_dict"]["paper"]
     train_perc = 0.1
+
     train_nodes = torch.randperm(num_papers)
     train_nodes = train_nodes[: int(train_perc * num_papers)]
+
     train_mask = torch.full((num_papers,), -1, device=device)
     train_mask[train_nodes] = 1
+
     fs.add_data(train_mask, "paper", "train")
 
     cugraph_store = CuGraphStore(fs, G, N)
@@ -128,47 +131,46 @@ def train(device: int, features_device: Union[str, int] = "cpu", num_epochs=2) -
         # barrier() cannot do this since the number of ops per rank is
         # different.  It essentially acts like barrier would if the
         # number of ops per rank was the same.
-        for epoch in range(num_epochs):
-            for iter_i, hetero_data in enumerate(cugraph_bulk_loader):
-                num_batches += 1
-                if iter_i % 20 == 0:
-                    print(f"iteration {iter_i}")
-
-                # train
-                train_mask = hetero_data.train_dict["paper"]
-                y_true = hetero_data.y_dict["paper"]
-
-                y_pred = model(
-                    hetero_data.x_dict["paper"].to(device).to(torch.float32),
-                    hetero_data.edge_index_dict[("paper", "cites", "paper")].to(device),
-                    (len(y_true), len(y_true)),
-                )
-
-                y_true = F.one_hot(
-                    y_true[train_mask].to(torch.int64), num_classes=349
-                ).to(torch.float32)
-
-                y_pred = y_pred[train_mask]
-
-                loss = F.cross_entropy(y_pred, y_true)
-
-                optimizer.zero_grad()
-                loss.backward()
-                optimizer.step()
-                total_loss += loss.item()
-
-                del y_true
-                del y_pred
-                del loss
-                del hetero_data
-                gc.collect()
-
-            end_time_train = time.perf_counter_ns()
-            print(
-                f"epoch {epoch} time: "
-                f"{(end_time_train - start_time_train) / 1e9:3.4f} s"
+        for iter_i, hetero_data in enumerate(cugraph_bulk_loader):
+            num_batches += 1
+            if iter_i % 20 == 0:
+                print(f"iteration {iter_i}")
+
+            # train
+            train_mask = hetero_data.train_dict["paper"]
+            y_true = hetero_data.y_dict["paper"]
+
+            y_pred = model(
+                hetero_data.x_dict["paper"].to(device).to(torch.float32),
+                hetero_data.edge_index_dict[("paper", "cites", "paper")].to(device),
+                (len(y_true), len(y_true)),
             )
-            print(f"loss after epoch {epoch}: {total_loss / num_batches}")
+
+            y_true = F.one_hot(y_true[train_mask].to(torch.int64), num_classes=349).to(
+                torch.float32
+            )
+
+            y_pred = y_pred[train_mask]
+
+            loss = F.cross_entropy(y_pred, y_true)
+
+            optimizer.zero_grad()
+            loss.backward()
+            optimizer.step()
+            total_loss += loss.item()
+
+            del y_true
+            del y_pred
+            del loss
+            del hetero_data
+            gc.collect()
+
+        end_time_train = time.perf_counter_ns()
+        print(
+            f"epoch {epoch} time: "
+            f"{(end_time_train - start_time_train) / 1e9:3.4f} s"
+        )
+        print(f"loss after epoch {epoch}: {total_loss / num_batches}")
 
 
 def parse_args():