[IMP] Sample with Offsets in the Bulk Sampler #3524

Merged: 64 commits, May 10, 2023
Commits
f462ec9
select edge props
alexbarghi-nv Apr 17, 2023
5bab68b
remove unwanted files
alexbarghi-nv Apr 17, 2023
ed462b7
fix style
alexbarghi-nv Apr 17, 2023
8a3bd1e
Merge branch 'branch-23.06' of https://github.com/rapidsai/cugraph in…
alexbarghi-nv Apr 19, 2023
f13f2b6
updates to store
alexbarghi-nv Apr 24, 2023
695f493
Merge branch 'branch-23.06' of https://github.com/rapidsai/cugraph in…
alexbarghi-nv Apr 24, 2023
8fe9acb
throw exception for unweighted sssp
alexbarghi-nv Apr 24, 2023
4012f4f
Merge branch 'branch-23.06' into select-edge-props
alexbarghi-nv Apr 24, 2023
7a29d57
add tests for unweighted graphs in C API, update C API implementation…
ChuckHastings Apr 25, 2023
b1f9fa7
Merge branch 'branch-23.06' into capi_handle_default_weights
ChuckHastings Apr 25, 2023
1e8323a
Merge branch 'capi_handle_default_weights' of https://github.com/chuc…
alexbarghi-nv Apr 26, 2023
a44483e
refactor to use new fill_edge_property
ChuckHastings Apr 26, 2023
442e1ce
Rename graph_helper.cu
ChuckHastings Apr 26, 2023
7e347de
need to sort after shuffling
ChuckHastings Apr 26, 2023
8232c13
Merge branch 'branch-23.06' into alex_uns_bug
ChuckHastings Apr 26, 2023
b27b544
Merge branch 'branch-23.06' into alex_uns_bug
ChuckHastings Apr 27, 2023
d2ba15f
Merge branch 'branch-23.06' into capi_handle_default_weights
ChuckHastings Apr 27, 2023
9e451ca
Merge branch 'capi_handle_default_weights' of https://github.com/chuc…
alexbarghi-nv Apr 27, 2023
d34af46
pull in chuck's changes, update sssp tests
alexbarghi-nv Apr 27, 2023
e7bfba4
style
alexbarghi-nv Apr 27, 2023
6b4bf78
Merge branch 'alex_uns_bug' of https://github.com/chuckhastings/cugra…
alexbarghi-nv Apr 27, 2023
90eb5cc
remove unused header. Modify MG egonet test to work properly
ChuckHastings Apr 28, 2023
01d46f7
update cugraph-pyg and cugraph-dgl
alexbarghi-nv Apr 28, 2023
9a30a7e
fix style and copyright
alexbarghi-nv May 1, 2023
c5eb9aa
Merge branch 'branch-23.06' into sampling-with-offsets
alexbarghi-nv May 1, 2023
5ef7b1c
Merge branch 'branch-23.06' into sampling-with-offsets
alexbarghi-nv May 2, 2023
7ed1861
Merge branch 'branch-23.06' into select-edge-props
alexbarghi-nv May 2, 2023
5a0124a
Merge branch 'capi_handle_default_weights' of https://github.com/chuc…
alexbarghi-nv May 2, 2023
cc305dd
Merge branch 'select-edge-props' into sampling-with-offsets
alexbarghi-nv May 2, 2023
4f732a8
fix pylibcugraph empty weights issue, update tests
alexbarghi-nv May 2, 2023
6fc0e31
Merge branch 'branch-23.06' into select-edge-props
alexbarghi-nv May 5, 2023
918bdbe
style,copyright
alexbarghi-nv May 5, 2023
b131cc3
Merge branch 'select-edge-props' into sampling-with-offsets
alexbarghi-nv May 5, 2023
4c30882
style
alexbarghi-nv May 5, 2023
aa8cca5
style
alexbarghi-nv May 5, 2023
e8b05a5
iterator, print statement fix
alexbarghi-nv May 8, 2023
99e2491
remove egonet prints
alexbarghi-nv May 8, 2023
11a1a62
fix
alexbarghi-nv May 8, 2023
3f23d4d
style fix
alexbarghi-nv May 8, 2023
911517c
remove print
alexbarghi-nv May 8, 2023
c1ddad8
reformat
alexbarghi-nv May 8, 2023
b2d5af2
Merge branch 'select-edge-props' into sampling-with-offsets
alexbarghi-nv May 8, 2023
d54cd6a
update docstrings
alexbarghi-nv May 8, 2023
bb35c6c
Merge branch 'select-edge-props' into sampling-with-offsets
alexbarghi-nv May 8, 2023
09bd0fa
Merge branch 'branch-23.06' into sampling-with-offsets
alexbarghi-nv May 8, 2023
5f52c08
correct error message
alexbarghi-nv May 8, 2023
4cdb783
correct error message
alexbarghi-nv May 8, 2023
df7f0ab
update graph creation with warning when edge id type doesn't match
alexbarghi-nv May 8, 2023
66cebf4
Merge branch 'select-edge-props' of https://github.com/alexbarghi-nv/…
alexbarghi-nv May 8, 2023
fef1184
Merge branch 'select-edge-props' into sampling-with-offsets
alexbarghi-nv May 8, 2023
155536f
fix style
alexbarghi-nv May 9, 2023
1bb9271
fix style
alexbarghi-nv May 9, 2023
67b57e8
Merge branch 'branch-23.06' into select-edge-props
alexbarghi-nv May 9, 2023
ad051c1
remove rank option
alexbarghi-nv May 9, 2023
d34190d
remove explicit gtest, gmock dependencies
alexbarghi-nv May 9, 2023
70632dc
Merge branch 'select-edge-props' of https://github.com/alexbarghi-nv/…
alexbarghi-nv May 9, 2023
bc4b2a1
generate
alexbarghi-nv May 9, 2023
a2bd389
update gtest to 1.13
alexbarghi-nv May 9, 2023
b9202dd
generate
alexbarghi-nv May 9, 2023
abbe83e
fix dependencies
alexbarghi-nv May 9, 2023
4b89269
generate
alexbarghi-nv May 9, 2023
3a0958f
Merge branch 'select-edge-props' into sampling-with-offsets
alexbarghi-nv May 9, 2023
56b43d1
Merge branch 'branch-23.06' into sampling-with-offsets
BradReesWork May 9, 2023
438cfeb
Merge branch 'branch-23.06' into sampling-with-offsets
alexbarghi-nv May 9, 2023
3 changes: 0 additions & 3 deletions python/cugraph-dgl/cugraph_dgl/dataloading/dataloader.py
@@ -209,14 +209,12 @@ def __iter__(self):
output_dir = os.path.join(
self._sampling_output_dir, "epoch_" + str(self.epoch_number)
)
rank = self._rank
bs = BulkSampler(
output_path=output_dir,
batch_size=self._batch_size,
graph=self._cugraph_graph,
batches_per_partition=self._batches_per_partition,
seeds_per_call=self._seeds_per_call,
rank=rank,
fanout_vals=self.graph_sampler._reversed_fanout_vals,
with_replacement=self.graph_sampler.replace,
)
@@ -226,7 +224,6 @@ def __iter__(self):
batch_df = create_batch_df(self.tensorized_indices_ds)
bs.add_batches(batch_df, start_col_name="start", batch_col_name="batch_id")
bs.flush()
output_dir = output_dir + f"/rank={rank}/"
self.cugraph_dgl_dataset.set_input_files(input_directory=output_dir)
self.epoch_number = self.epoch_number + 1
return super().__iter__()
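
With the rank option gone, each cugraph-dgl worker now writes its sample partitions directly into the epoch directory rather than a rank=<n> subdirectory. Below is a minimal sketch of the resulting BulkSampler setup, assuming a tiny single-GPU graph and illustrative batch contents; the import path and all parameter values here are assumptions for illustration, not taken from the PR.

import os
import cudf
import cugraph
from cugraph.gnn import BulkSampler  # experimental; import path is an assumption

# A tiny single-GPU graph so the sketch is self-contained.
edges = cudf.DataFrame({"src": [0, 1, 2, 3], "dst": [1, 2, 3, 0]})
G = cugraph.Graph(directed=True)
G.from_cudf_edgelist(edges, source="src", destination="dst")

output_dir = os.path.join("/tmp/samples", "epoch_0")

bs = BulkSampler(
    output_path=output_dir,       # note: no rank= argument after this PR
    batch_size=2,
    graph=G,
    batches_per_partition=100,
    seeds_per_call=200_000,
    fanout_vals=[2, 2],
    with_replacement=False,
)

batch_df = cudf.DataFrame(
    {
        "start": cudf.Series([0, 1, 2, 3]),                  # seed vertices
        "batch_id": cudf.Series([0, 0, 1, 1], dtype="int32"),
    }
)
bs.add_batches(batch_df, start_col_name="start", batch_col_name="batch_id")
bs.flush()
# Partitions now land directly under output_dir as batch=<first>-<last>.parquet.
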
6 changes: 2 additions & 4 deletions python/cugraph-pyg/cugraph_pyg/data/cugraph_store.py
@@ -420,8 +420,6 @@ def __construct_graph(
{
"src": pandas.Series(na_src),
"dst": pandas.Series(na_dst),
"w": pandas.Series(np.zeros(len(na_src))),
"eid": pandas.Series(np.arange(len(na_src))),
"etp": pandas.Series(na_etp),
}
)
@@ -441,15 +439,15 @@ def __construct_graph(
df,
source="src",
destination="dst",
edge_attr=["w", "eid", "etp"],
edge_type="etp",
)
distributed.get_client().publish_dataset(cugraph_graph=graph)
else:
graph.from_cudf_edgelist(
df,
source="src",
destination="dst",
edge_attr=["w", "eid", "etp"],
edge_type="etp",
)

return graph
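
Because edge ids and types are now carried as first-class edge properties, the store no longer needs the placeholder "w" and "eid" columns. A sketch of the simplified single-GPU construction path follows, with illustrative edge data; the from_cudf_edgelist arguments mirror the diff above, while the MultiGraph type and column values are assumptions.

import cudf
import cugraph

df = cudf.DataFrame(
    {
        "src": cudf.Series([0, 1, 2]),
        "dst": cudf.Series([1, 2, 0]),
        "etp": cudf.Series([0, 0, 1], dtype="int32"),  # edge type codes
    }
)

graph = cugraph.MultiGraph(directed=True)
graph.from_cudf_edgelist(
    df,
    source="src",
    destination="dst",
    edge_type="etp",  # replaces the old edge_attr=["w", "eid", "etp"]
)
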
46 changes: 24 additions & 22 deletions python/cugraph-pyg/cugraph_pyg/loader/cugraph_node_loader.py
@@ -14,6 +14,7 @@
import tempfile

import os
import re

import cupy
import cudf
@@ -31,6 +32,9 @@


class EXPERIMENTAL__BulkSampleLoader:

__ex_parquet_file = re.compile(r"batch=([0-9]+)\-([0-9]+)\.parquet")

def __init__(
self,
feature_store: CuGraphStore,
@@ -40,7 +44,6 @@ def __init__(
shuffle=False,
edge_types: Sequence[Tuple[str]] = None,
directory=None,
rank=0,
starting_batch_id=0,
batches_per_partition=100,
# Sampler args
@@ -84,10 +87,6 @@ def __init__(
The path of the directory to write samples to.
Defaults to a new generated temporary directory.

rank: int (optional, default=0)
The rank of the current worker. Should be provided
when there are multiple workers.

starting_batch_id: int (optional, default=0)
The starting id for each batch. Defaults to 0.
Generally used when loading previously-sampled
@@ -102,16 +101,16 @@

self.__feature_store = feature_store
self.__graph_store = graph_store
self.__rank = rank
self.__next_batch = starting_batch_id
self.__end_exclusive = starting_batch_id
self.__next_batch = -1
self.__end_exclusive = -1
self.__batches_per_partition = batches_per_partition
self.__starting_batch_id = starting_batch_id

if isinstance(all_indices, int):
# Will be loading from disk
self.__num_batches = all_indices
self.__directory = directory
iter(os.listdir(self.__directory))
return

if batch_size is None or batch_size < 1:
@@ -123,7 +122,6 @@ def __init__(
batch_size,
self.__directory.name,
self.__graph_store._subgraph(edge_types),
rank=rank,
fanout_vals=num_neighbors,
with_replacement=replace,
batches_per_partition=self.__batches_per_partition,
@@ -161,33 +159,36 @@ def __init__(
)

bulk_sampler.flush()
self.__input_files = iter(os.listdir(self.__directory.name))

def __next__(self):
# Quit iterating if there are no batches left
if self.__next_batch >= self.__num_batches + self.__starting_batch_id:
raise StopIteration

# Load the next set of sampling results if necessary
if self.__next_batch >= self.__end_exclusive:
if self.__directory is None:
raise StopIteration

# Read the next parquet file into memory
dir_path = (
self.__directory
if isinstance(self.__directory, str)
else self.__directory.name
)
rank_path = os.path.join(dir_path, f"rank={self.__rank}")

file_end_batch_incl = min(
self.__end_exclusive + self.__batches_per_partition - 1,
self.__starting_batch_id + self.__num_batches - 1,
)
# Will raise StopIteration if there are no files left
fname = next(self.__input_files)

m = self.__ex_parquet_file.match(fname)
if m is None:
raise ValueError(f"Invalid parquet filename {fname}")

self.__next_batch, end_inclusive = [int(g) for g in m.groups()]
self.__end_exclusive = end_inclusive + 1

parquet_path = os.path.join(
rank_path,
f"batch={self.__end_exclusive}" f"-{file_end_batch_incl}.parquet",
dir_path,
fname,
)

self.__end_exclusive += self.__batches_per_partition

columns = {
"sources": "int64",
"destinations": "int64",
@@ -212,6 +213,7 @@ def __next__(self):
if self.__next_batch >= self.__num_batches + self.__starting_batch_id:
# Won't delete a non-temp dir (since it would just be deleting a string)
del self.__directory
self.__directory = None

# Get and return the sampled subgraph
if isinstance(torch_geometric, MissingModule):
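
With the rank directory gone, the loader discovers partitions by listing the output directory and recovering each file's batch range from its name. Here is a standalone sketch of that convention, using the same regex as the loader above; the filenames are illustrative.

import re

ex_parquet_file = re.compile(r"batch=([0-9]+)\-([0-9]+)\.parquet")

for fname in ["batch=0-99.parquet", "batch=100-199.parquet"]:
    m = ex_parquet_file.match(fname)
    if m is None:
        raise ValueError(f"Invalid parquet filename {fname}")
    next_batch, end_inclusive = (int(g) for g in m.groups())
    end_exclusive = end_inclusive + 1  # loader iterates [next_batch, end_exclusive)
    print(f"{fname}: batches {next_batch} through {end_inclusive}")
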
python/cugraph/cugraph/dask/sampling/uniform_neighbor_sample.py
@@ -119,6 +119,13 @@ def convert_to_cudf(cp_arrays, weight_t, with_edge_properties, return_offsets=False
df[edge_type_n] = edge_types
df[hop_id_n] = hop_ids

print(
f"sources: {sources}\n"
f"destinations: {destinations}\n"
f"batch: {batch_ids}\n"
f"offset: {offsets}\n"
)

if return_offsets:
offsets_df = cudf.DataFrame(
{
@@ -297,6 +304,7 @@ def uniform_neighbor_sample(
List of output GPUs (by rank) corresponding to batch
id labels in the label list. Used to assign each batch
id to a GPU.
Must be in ascending order (e.g., [0, 0, 1, 2]).

random_state: int, optional
Random seed to use when making sampling calls.
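
The ascending-order requirement in the docstring above matches how the bulk sampler assigns contiguous batch ranges to workers. A small sketch of building such a mapping, mirroring the __get_label_to_output_comm_rank helper in the bulk_sampler.py diff below; the batch and worker counts are illustrative.

import cupy
import cudf

num_batches, num_workers = 7, 3
z = cupy.zeros(num_batches, dtype="int32")
for rank, chunk in enumerate(
    cupy.array_split(cupy.arange(num_batches), num_workers)
):
    z[chunk] = rank  # contiguous run of batches assigned to this rank

print(cudf.Series(z))  # 0 0 0 1 1 2 2 -- ascending, as required
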
76 changes: 33 additions & 43 deletions python/cugraph/cugraph/gnn/data_loading/bulk_sampler.py
@@ -15,11 +15,16 @@

from typing import Union

import cupy
import cudf
import dask_cudf
import cugraph.dask as dask_cugraph

import cugraph
import pylibcugraph

from cugraph.gnn.data_loading.bulk_sampler_io import write_samples


class EXPERIMENTAL__BulkSampler:
start_col_name = "_START_"
@@ -32,7 +37,6 @@ def __init__(
graph,
seeds_per_call: int = 200_000,
batches_per_partition=100,
rank: int = 0,
**kwargs,
):
"""
@@ -51,9 +55,6 @@
a single sampling call.
batches_per_partition: int (optional, default=100)
The number of batches outputted to a single parquet partition.
rank: int (optional, default=0)
The rank of this sampler. Used to isolate this sampler from
others that may be running on other nodes.
kwargs: kwargs
Keyword arguments to be passed to the sampler (i.e. fanout).
"""
@@ -75,14 +76,9 @@ def __init__(
self.__graph = graph
self.__seeds_per_call = seeds_per_call
self.__batches_per_partition = batches_per_partition
self.__rank = rank
self.__batches = None
self.__sample_call_args = kwargs

@property
def rank(self) -> int:
return self.__rank

@property
def seeds_per_call(self) -> int:
return self.__seeds_per_call
@@ -195,55 +191,49 @@ def flush(self) -> None:
sample_fn = cugraph.uniform_neighbor_sample
else:
sample_fn = cugraph.dask.uniform_neighbor_sample
self.__sample_call_args["_multiple_clients"] = True
self.__sample_call_args.update(
{
"_multiple_clients": True,
"label_to_output_comm_rank": self.__get_label_to_output_comm_rank(
min_batch_id, max_batch_id
),
"label_list": cupy.arange(
min_batch_id, max_batch_id + 1, dtype="int32"
),
}
)

samples = sample_fn(
samples, offsets = sample_fn(
self.__graph,
**self.__sample_call_args,
start_list=self.__batches[self.start_col_name][batch_id_filter],
batch_id_list=self.__batches[self.batch_col_name][batch_id_filter],
with_edge_properties=True,
return_offsets=True,
)

self.__batches = self.__batches[~batch_id_filter]
self.__write(samples, min_batch_id, npartitions)
self.__write(samples, offsets)

if self.size > 0:
self.flush()

def __write(
self,
samples: Union[cudf.DataFrame, dask_cudf.DataFrame],
min_batch_id: int,
npartitions: int,
offsets: Union[cudf.DataFrame, dask_cudf.DataFrame],
) -> None:
# Ensure each rank writes to its own partition so there is no conflict
outer_partition = f"rank={self.__rank}"
outer_partition_path = os.path.join(self.__output_path, outer_partition)
os.makedirs(outer_partition_path, exist_ok=True)

for partition_k in range(npartitions):
ix_partition_start_inclusive = (
min_batch_id + partition_k * self.batches_per_partition
)
ix_partition_end_inclusive = (
min_batch_id + (partition_k + 1) * self.batches_per_partition - 1
)
f = (samples.batch_id >= ix_partition_start_inclusive) & (
samples.batch_id <= ix_partition_end_inclusive
)
if len(samples[f]) == 0:
break

ix_partition_end_inclusive = samples[f].batch_id.max()
if hasattr(ix_partition_end_inclusive, "compute"):
ix_partition_end_inclusive = ix_partition_end_inclusive.compute()
ix_partition_end_inclusive = int(ix_partition_end_inclusive)

inner_path = os.path.join(
outer_partition_path,
f"batch={ix_partition_start_inclusive}-{ix_partition_end_inclusive}"
".parquet",
)
os.makedirs(self.__output_path, exist_ok=True)
write_samples(
samples, offsets, self.__batches_per_partition, self.__output_path
)

def __get_label_to_output_comm_rank(self, min_batch_id, max_batch_id):
num_workers = dask_cugraph.get_n_workers()
num_batches = max_batch_id - min_batch_id + 1
z = cupy.zeros(num_batches, dtype="int32")
s = cupy.array_split(cupy.arange(num_batches), num_workers)
for i, t in enumerate(s):
z[t] = i

samples[f].to_parquet(inner_path, index=False)
return cudf.Series(z)
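
End to end, flush() now asks the sampler for both the samples and a per-batch offsets frame, and write_samples slices partitions from the offsets instead of filtering on samples.batch_id. A single-GPU sketch of that call follows, with an illustrative graph and batch assignment; the exact offsets column layout follows the convert_to_cudf changes earlier in this PR.

import cudf
import cugraph

edges = cudf.DataFrame({"src": [0, 1, 2, 3], "dst": [1, 2, 3, 0]})
G = cugraph.Graph(directed=True)
G.from_cudf_edgelist(edges, source="src", destination="dst")

samples, offsets = cugraph.uniform_neighbor_sample(
    G,
    start_list=cudf.Series([0, 1, 2, 3]),
    batch_id_list=cudf.Series([0, 0, 1, 1], dtype="int32"),
    fanout_vals=[2, 2],
    with_edge_properties=True,
    return_offsets=True,  # new in this PR: batch boundaries come back separately
)
# `offsets` records where each batch's rows begin in `samples`, so
# write_samples() can emit batch=<first>-<last>.parquet partitions
# without scanning samples.batch_id.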