From 86b20382681c4e8adb41904d17daf398a3f5f204 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 19:52:08 -0800 Subject: [PATCH 01/60] add support for rng state --- cpp/include/cugraph_c/sampling_algorithms.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index f048d338b9..ac029181ba 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -83,6 +83,7 @@ cugraph_error_code_t cugraph_biased_random_walks( * @brief Compute random walks using the node2vec framework. * * @param [in] handle Handle for accessing resources + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] graph Pointer to graph. NOTE: Graph might be modified if the storage * needs to be transposed * @param [in] start_vertices Array of source vertices @@ -98,6 +99,7 @@ cugraph_error_code_t cugraph_biased_random_walks( */ cugraph_error_code_t cugraph_node2vec_random_walks( const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start_vertices, size_t max_length, From 38690a6e8096b7785649e4243d408abd51324708 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 19:56:03 -0800 Subject: [PATCH 02/60] update test to take rng state parameter --- cpp/tests/c_api/sg_random_walks_test.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/tests/c_api/sg_random_walks_test.c b/cpp/tests/c_api/sg_random_walks_test.c index 05d77a0b3b..71c76f3f94 100644 --- a/cpp/tests/c_api/sg_random_walks_test.c +++ b/cpp/tests/c_api/sg_random_walks_test.c @@ -308,8 +308,12 @@ int generic_node2vec_random_walks_test(vertex_t* h_src, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, 0, 
&rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + ret_code = cugraph_node2vec_random_walks( - handle, graph, d_start_view, max_depth, p, q, &result, &ret_error); + handle, rng_state, graph, d_start_view, max_depth, p, q, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "node2vec_random_walks failed."); From fd5b387d91cbf3c6a6f33b84d2587a77aea6e273 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 19:57:20 -0800 Subject: [PATCH 03/60] add support for rng state --- cpp/src/c_api/random_walks.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/src/c_api/random_walks.cpp b/cpp/src/c_api/random_walks.cpp index 705d210843..7d883df9dd 100644 --- a/cpp/src/c_api/random_walks.cpp +++ b/cpp/src/c_api/random_walks.cpp @@ -365,7 +365,6 @@ struct biased_random_walks_functor : public cugraph::c_api::abstract_functor { struct node2vec_random_walks_functor : public cugraph::c_api::abstract_functor { raft::handle_t const& handle_; - // FIXME: rng_state_ should be passed as a parameter cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; cugraph::c_api::cugraph_graph_t* graph_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertices_{nullptr}; @@ -375,6 +374,7 @@ struct node2vec_random_walks_functor : public cugraph::c_api::abstract_functor { cugraph::c_api::cugraph_random_walk_result_t* result_{nullptr}; node2vec_random_walks_functor(cugraph_resource_handle_t const* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, cugraph_type_erased_device_array_view_t const* start_vertices, size_t max_length, @@ -382,6 +382,7 @@ struct node2vec_random_walks_functor : public cugraph::c_api::abstract_functor { double q) : abstract_functor(), handle_(*reinterpret_cast(handle)->handle_), + 
rng_state_(reinterpret_cast(rng_state)), graph_(reinterpret_cast(graph)), start_vertices_( reinterpret_cast( @@ -443,10 +444,6 @@ struct node2vec_random_walks_functor : public cugraph::c_api::abstract_functor { graph_view.local_vertex_partition_range_last(), false); - // FIXME: remove once rng_state passed as parameter - rng_state_ = reinterpret_cast( - new cugraph::c_api::cugraph_rng_state_t{raft::random::RngState{0}}); - auto [paths, weights] = cugraph::node2vec_random_walks( handle_, rng_state_->rng_state_, @@ -588,6 +585,7 @@ cugraph_error_code_t cugraph_biased_random_walks( cugraph_error_code_t cugraph_node2vec_random_walks( const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start_vertices, size_t max_length, @@ -604,7 +602,7 @@ cugraph_error_code_t cugraph_node2vec_random_walks( "vertex type of graph and start_vertices must match", *error); - node2vec_random_walks_functor functor(handle, graph, start_vertices, max_length, p, q); + node2vec_random_walks_functor functor(handle, rng_state, graph, start_vertices, max_length, p, q); return cugraph::c_api::run_algorithm(graph, functor, result, error); } From 9d56b5f354dc4e23e650136457c6150e87a28d9a Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 20:09:12 -0800 Subject: [PATCH 04/60] deprecate old API --- python/pylibcugraph/pylibcugraph/node2vec.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/pylibcugraph/pylibcugraph/node2vec.pyx b/python/pylibcugraph/pylibcugraph/node2vec.pyx index 0e0fd73e6c..5729dc6e05 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec.pyx @@ -66,6 +66,8 @@ def node2vec(ResourceHandle resource_handle, """ Computes random walks under node2vec sampling procedure. 
+ This API is deprecated; call node2vec_random_walks instead. + Parameters ---------- resource_handle : ResourceHandle From 615837e80d55b407f932372501ec68df1249792d Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 20:12:57 -0800 Subject: [PATCH 05/60] add new API for node2vec random walks --- .../pylibcugraph/pylibcugraph/CMakeLists.txt | 1 + python/pylibcugraph/pylibcugraph/__init__.py | 2 + .../pylibcugraph/_cugraph_c/algorithms.pxd | 19 +- .../pylibcugraph/node2vec_random_walks.pyx | 184 ++++++++++++++++++ 4 files changed, 205 insertions(+), 1 deletion(-) create mode 100644 python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index fe7c4b64aa..44963bdc5e 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -44,6 +44,7 @@ set(cython_sources leiden.pyx louvain.pyx node2vec.pyx + node2vec_random_walks.pyx pagerank.pyx personalized_pagerank.pyx random.pyx diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 9047144c13..cd5b23db1a 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -37,6 +37,8 @@ from pylibcugraph.node2vec import node2vec +from pylibcugraph.node2vec_random_walks import node2vec_random_walks + from pylibcugraph.bfs import bfs from pylibcugraph.uniform_neighbor_sample import uniform_neighbor_sample diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index 38781614b2..a8e5bb7fdc 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -24,6 +24,9 @@ from pylibcugraph._cugraph_c.error cimport ( cugraph_error_code_t, cugraph_error_t, ) +from pylibcugraph._cugraph_c.random cimport ( +
cugraph_rng_state_t, +) from pylibcugraph._cugraph_c.array cimport ( cugraph_type_erased_device_array_view_t, cugraph_type_erased_host_array_view_t, @@ -148,7 +151,7 @@ cdef extern from "cugraph_c/algorithms.h": cugraph_random_walk_result_t* result ) - # node2vec + # node2vec - Deprecated, call node2vec_random_walks instead cdef cugraph_error_code_t \ cugraph_node2vec( const cugraph_resource_handle_t* handle, @@ -377,3 +380,17 @@ cdef extern from "cugraph_c/algorithms.h": cugraph_random_walk_result_t** result, cugraph_error_t** error ) + + # node2vec random walks + cdef cugraph_error_code_t \ + cugraph_node2vec_random_walks( + const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* start_vertices, + size_t max_length, + double p, + double q, + cugraph_random_walk_result_t** result, + cugraph_error_t** error + ) \ No newline at end of file diff --git a/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx new file mode 100644 index 0000000000..6df1472787 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx @@ -0,0 +1,184 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_create, + cugraph_type_erased_device_array_view_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.algorithms cimport ( + cugraph_node2vec_random_walks, + cugraph_random_walk_result_t, + cugraph_random_walk_result_get_paths, + cugraph_random_walk_result_get_weights, + cugraph_random_walk_result_free, +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph._cugraph_c.random cimport ( + cugraph_rng_state_t +) +from pylibcugraph.random cimport ( + CuGraphRandomState +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + assert_CAI_type, + get_c_type_from_numpy_type, +) + + +def node2vec(ResourceHandle resource_handle, + _GPUGraph graph, + seed_array, + size_t max_depth, + double p, + double q, + random_state=None): + """ + Computes random walks under node2vec sampling procedure. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph + The input graph. + + seed_array: device array type + Device array containing the pointer to the array of seed vertices. + + max_depth : size_t + Maximum number of vertices in generated path + + p : double + The return factor p represents the likelihood of backtracking to a node + in the walk. 
A higher value (> max(q, 1)) makes it less likely to sample + a previously visited node, while a lower value (< min(q, 1)) would make it + more likely to backtrack, making the walk more "local". + + q : double + The in-out factor q represents the likelihood of visiting nodes closer or + further from the outgoing node. If q > 1, the random walk is likelier to + visit nodes closer to the outgoing node. If q < 1, the random walk is + likelier to visit nodes further from the outgoing node. + + random_state: int (Optional) + Random state to use when generating samples. Optional argument, + defaults to a hash of process id, time, and hostname. + (See pylibcugraph.random.CuGraphRandomState) + + Returns + ------- + A tuple of device arrays, where the first item in the tuple is a device + array containing the compressed paths, the second item is a device + array containing the corresponding weights for each edge traversed in + each path. + + Examples + -------- + >>> import pylibcugraph, cupy, numpy + >>> srcs = cupy.asarray([0, 1, 2], dtype=numpy.int32) + >>> dsts = cupy.asarray([1, 2, 3], dtype=numpy.int32) + >>> seeds = cupy.asarray([0, 0, 1], dtype=numpy.int32) + >>> weights = cupy.asarray([1.0, 1.0, 1.0], dtype=numpy.float32) + >>> resource_handle = pylibcugraph.ResourceHandle() + >>> graph_props = pylibcugraph.GraphProperties( + ... is_symmetric=False, is_multigraph=False) + >>> G = pylibcugraph.SGGraph( + ... resource_handle, graph_props, srcs, dsts, weight_array=weights, + ... store_transposed=False, renumber=False, do_expensive_check=False) + >>> (paths, weights, sizes) = pylibcugraph.node2vec( + ... resource_handle, G, seeds, 3, 1.0, 1.0) + + """ + + # FIXME: import these modules here for now until a better pattern can be + # used for optional imports (perhaps 'import_optional()' from cugraph), or + # these are made hard dependencies. 
+ try: + import cupy + except ModuleNotFoundError: + raise RuntimeError("node2vec requires the cupy package, which could not " + "be imported") + assert_CAI_type(seed_array, "seed_array") + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_random_walk_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef uintptr_t cai_seed_ptr = \ + seed_array.__cuda_array_interface__["data"][0] + cdef cugraph_type_erased_device_array_view_t* seed_view_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_seed_ptr, + len(seed_array), + get_c_type_from_numpy_type(seed_array.dtype)) + + cg_rng_state = CuGraphRandomState(resource_handle, random_state) + + cdef cugraph_rng_state_t* rng_state_ptr = \ + cg_rng_state.rng_state_ptr + + error_code = cugraph_node2vec_random_walks(c_resource_handle_ptr, + rng_state_ptr, + c_graph_ptr, + seed_view_ptr, + max_depth, + p, + q, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_node2vec_random_walks") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* paths_ptr = \ + cugraph_random_walk_result_get_paths(result_ptr) + cdef cugraph_type_erased_device_array_view_t* weights_ptr = \ + cugraph_random_walk_result_get_weights(result_ptr) + + cupy_paths = copy_to_cupy_array(c_resource_handle_ptr, paths_ptr) + cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weights_ptr) + + cugraph_random_walk_result_free(result_ptr) + cugraph_type_erased_device_array_view_free(seed_view_ptr) + + return (cupy_paths, cupy_weights) From 21a76bb4921197896f4484bcbca34d9b89ea15bb Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 20:20:37 -0800 Subject: [PATCH 06/60] add mg node2vec random walks to the python API --- python/cugraph/cugraph/dask/__init__.py | 1 + .../dask/sampling/node2vec_random_walks.py | 218 ++++++++++++++++++ 2 files changed, 219 insertions(+) create mode 100644 python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index b1588008bc..b8753fc461 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -28,6 +28,7 @@ from .components.connectivity import weakly_connected_components from .sampling.uniform_neighbor_sample import uniform_neighbor_sample from .sampling.random_walks import random_walks +from .sampling.node2vec_random_walks import node2vec_random_walks from .centrality.eigenvector_centrality import eigenvector_centrality from .cores.core_number import core_number from .centrality.betweenness_centrality import betweenness_centrality diff --git a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py new file mode 100644 index 0000000000..d70601841c --- /dev/null +++ b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py @@ -0,0 +1,218 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dask.distributed import wait, default_client +import dask_cudf +import cudf +import operator as op +from cugraph.dask.common.part_utils import ( + persist_dask_df_equal_parts_per_worker, +) + +from pylibcugraph import ResourceHandle + +from pylibcugraph import ( + node2vec_random_walks as pylibcugraph_node2vec_random_walks, +) + +from cugraph.dask.comms import comms as Comms + + +def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): + """ + Creates cudf Series from cupy arrays from pylibcugraph wrapper + """ + + if is_vertex_paths and len(cp_paths) > 0: + if number_map.implementation.numbered: + df_ = cudf.DataFrame() + df_["vertex_paths"] = cp_paths + df_ = number_map.unrenumber( + df_, "vertex_paths", preserve_order=True + ).compute() + vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) + + return vertex_paths + + return cudf.Series(cp_paths) + + +def _call_plc_node2vec_random_walks(sID, mg_graph_x, st_x, max_depth, compress_result, p, q): + + return pylibcugraph_node2vec_random_walks( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + seed_array=st_x, + max_depth=max_depth, + compress_result=compress_result, + p=p, + q=q + ) + + +def node2vec_random_walks( + input_graph, + start_vertices=None, + max_depth=None, + compress_result=True, + p=1.0, + q=1.0 +): + """ + Computes random walks for each node in 'start_vertices', under the + 
node2vec_random_walks sampling framework. + + parameters + ---------- + input_graph : cuGraph.Graph + The graph can be either directed or undirected. + + start_vertices: int or list or cudf.Series or cudf.DataFrame + A single node or a list or a cudf.Series of nodes from which to run + the random walks. In case of multi-column vertices it should be + a cudf.DataFrame. Only supports int32 currently. + + max_depth: int, optional (default=1) + The maximum depth of the random walks. If not specified, the maximum + depth is set to 1. + + compress_result: bool, optional (default=True) + If True, coalesced paths are returned with a sizes array with offsets. + Otherwise padded paths are returned with an empty sizes array. + + p: float, optional (default=1.0, [0 < p]) + Return factor, which represents the likelihood of backtracking to + a previous node in the walk. A higher value makes it less likely to + sample a previously visited node, while a lower value makes it more + likely to backtrack, making the walk "local". A positive float. + + q: float, optional (default=1.0, [0 < q]) + In-out factor, which represents the likelihood of visiting nodes + closer or further from the outgoing node. If q > 1, the random walk + is likelier to visit nodes closer to the outgoing node. If q < 1, the + random walk is likelier to visit nodes further from the outgoing node. + A positive float. + + Returns + ------- + vertex_paths : dask_cudf.Series or dask_cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: dask_cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + sizes : dask_cudf.Series + The path size or sizes in case of coalesced paths. 
+ """ + client = default_client() + + if (not isinstance(max_depth, int)) or (max_depth < 1): + raise ValueError( + f"'max_depth' must be a positive integer, " f"got: {max_depth}" + ) + if not isinstance(compress_result, bool): + raise ValueError( + f"'compress_result' must be a bool, " f"got: {compress_result}" + ) + if (not isinstance(p, float)) or (p <= 0.0): + raise ValueError(f"'p' must be a positive float, got: {p}") + if (not isinstance(q, float)) or (q <= 0.0): + raise ValueError(f"'q' must be a positive float, got: {q}") + + + + if isinstance(start_vertices, int): + start_vertices = [start_vertices] + + if isinstance(start_vertices, list): + start_vertices = cudf.Series(start_vertices) + + # start_vertices uses "external" vertex IDs, but if the graph has been + # renumbered, the start vertex IDs must also be renumbered. + if input_graph.renumbered: + # FIXME: This should match start_vertices type to the renumbered df type + # but verify that. If not retrieve the type and cast it when creating + # the dask_cudf from a cudf + start_vertices = input_graph.lookup_internal_vertex_id(start_vertices).compute() + start_vertices_type = input_graph.edgelist.edgelist_df.dtypes[0] + else: + # FIXME: Get the 'src' column names instead and retrieve the type + start_vertices_type = input_graph.input_df.dtypes.iloc[0] + start_vertices = dask_cudf.from_cudf( + start_vertices, npartitions=min(input_graph._npartitions, len(start_vertices)) + ) + start_vertices = start_vertices.astype(start_vertices_type) + start_vertices = persist_dask_df_equal_parts_per_worker( + start_vertices, client, return_type="dict" + ) + + #print("start vertex_type = ", start_vertices_type) + #print("edgelist type = ", input_graph.edgelist.edgelist_df) + + result = [ + client.submit( + _call_plc_node2vec_random_walks, + Comms.get_session_id(), + input_graph._plc_graph[w], + start_v[0] if start_v else cudf.Series(dtype=start_vertices_type), + max_depth, + compress_result=compress_result, + p=p, + q=q, 
+ workers=[w], + allow_other_workers=False, + ) + for w, start_v in start_vertices.items() + ] + + wait(result) + + result_vertex_paths = [client.submit(op.getitem, f, 0) for f in result] + result_edge_wgt_paths = [client.submit(op.getitem, f, 1) for f in result] + result_sizes = [client.submit(op.getitem, f, 2) for f in result] + + cudf_vertex_paths = [ + client.submit(convert_to_cudf, cp_vertex_paths, input_graph.renumber_map, True) + for cp_vertex_paths in result_vertex_paths + ] + + cudf_edge_wgt_paths = [ + client.submit(convert_to_cudf, cp_edge_wgt_paths) + for cp_edge_wgt_paths in result_edge_wgt_paths + ] + + cudf_sizes = [ + client.submit(convert_to_cudf, cp_sizes) + for cp_sizes in result_sizes + ] + + wait([cudf_vertex_paths, cudf_edge_wgt_paths, cudf_sizes]) + + + ddf_vertex_paths = dask_cudf.from_delayed(cudf_vertex_paths).persist() + ddf_edge_wgt_paths = dask_cudf.from_delayed(cudf_edge_wgt_paths).persist() + ddf_sizes = dask_cudf.from_delayed(cudf_sizes).persist() + #wait([ddf_vertex_paths, ddf_edge_wgt_paths]) + + # Wait until the inactive futures are released + wait( + [ + (r.release(), c_v.release(), c_e.release()) + for r, c_v, c_e, c_s in zip(result, cudf_vertex_paths, cudf_edge_wgt_paths, cudf_sizes) + ] + ) + + return ddf_vertex_paths, ddf_edge_wgt_paths, ddf_sizes + From 86a13d3d0b058af28816694f21add0634696838f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 20:29:07 -0800 Subject: [PATCH 07/60] update docstrings --- python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx index 6df1472787..fc391a5fec 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx @@ -121,8 +121,8 @@ def node2vec(ResourceHandle resource_handle, >>> G = pylibcugraph.SGGraph( ... 
resource_handle, graph_props, srcs, dsts, weight_array=weights, ... store_transposed=False, renumber=False, do_expensive_check=False) - >>> (paths, weights, sizes) = pylibcugraph.node2vec( - ... resource_handle, G, seeds, 3, 1.0, 1.0) + >>> (paths, weights) = pylibcugraph.node2vec_random_walks( + ... resource_handle, G, seeds, 3, 1.0, 1.0) """ From 4c8744f2716a3681b4fe686418be0b08ccd35ea0 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 20:40:08 -0800 Subject: [PATCH 08/60] enable mg node2vec_random walks --- cpp/src/c_api/random_walks.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/c_api/random_walks.cpp b/cpp/src/c_api/random_walks.cpp index 7d883df9dd..e21090f6d0 100644 --- a/cpp/src/c_api/random_walks.cpp +++ b/cpp/src/c_api/random_walks.cpp @@ -404,8 +404,6 @@ struct node2vec_random_walks_functor : public cugraph::c_api::abstract_functor { // FIXME: Think about how to handle SG vice MG if constexpr (!cugraph::is_candidate::value) { unsupported(); - } else if constexpr (multi_gpu) { - unsupported(); } else { // random walks expects store_transposed == false if constexpr (store_transposed) { From 4da1c7ed3bdd11bf915a37446c7336793c3c419b Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 20:42:22 -0800 Subject: [PATCH 09/60] update argument list in function call --- .../dask/sampling/node2vec_random_walks.py | 48 ++++++------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py index d70601841c..18171eda62 100644 --- a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py @@ -48,30 +48,31 @@ def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): return cudf.Series(cp_paths) -def _call_plc_node2vec_random_walks(sID, mg_graph_x, st_x, max_depth, compress_result, p, q): +def 
_call_plc_node2vec_random_walks(sID, mg_graph_x, st_x, max_depth, p, q, random_state): return pylibcugraph_node2vec_random_walks( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), graph=mg_graph_x, seed_array=st_x, max_depth=max_depth, - compress_result=compress_result, p=p, - q=q + q=q, + random_state=random_state ) +# FIXME: Add type annotation def node2vec_random_walks( input_graph, start_vertices=None, max_depth=None, - compress_result=True, p=1.0, - q=1.0 + q=1.0, + random_state=None ): """ Computes random walks for each node in 'start_vertices', under the - node2vec_random_walks sampling framework. + node2vec sampling framework. parameters ---------- @@ -87,10 +88,6 @@ def node2vec_random_walks( The maximum depth of the random walks. If not specified, the maximum depth is set to 1. - compress_result: bool, optional (default=True) - If True, coalesced paths are returned with a sizes array with offsets. - Otherwise padded paths are returned with an empty sizes array. - p: float, optional (default=1.0, [0 < p]) Return factor, which represents the likelihood of backtracking to a previous node in the walk. A higher value makes it less likely to @@ -103,6 +100,9 @@ def node2vec_random_walks( is likelier to visit nodes closer to the outgoing node. If q < 1, the random walk is likelier to visit nodes further from the outgoing node. A positive float. + + random_state: int, optional + Random seed to use when making sampling calls. Returns ------- @@ -112,9 +112,6 @@ def node2vec_random_walks( edge_weight_paths: dask_cudf.Series Series containing the edge weights of edges represented by the returned vertex_paths - - sizes : dask_cudf.Series - The path size or sizes in case of coalesced paths.
""" client = default_client() @@ -122,10 +119,6 @@ def node2vec_random_walks( raise ValueError( f"'max_depth' must be a positive integer, " f"got: {max_depth}" ) - if not isinstance(compress_result, bool): - raise ValueError( - f"'compress_result' must be a bool, " f"got: {compress_result}" - ) if (not isinstance(p, float)) or (p <= 0.0): raise ValueError(f"'p' must be a positive float, got: {p}") if (not isinstance(q, float)) or (q <= 0.0): @@ -158,9 +151,6 @@ def node2vec_random_walks( start_vertices, client, return_type="dict" ) - #print("start vertex_type = ", start_vertices_type) - #print("edgelist type = ", input_graph.edgelist.edgelist_df) - result = [ client.submit( _call_plc_node2vec_random_walks, @@ -168,9 +158,9 @@ def node2vec_random_walks( input_graph._plc_graph[w], start_v[0] if start_v else cudf.Series(dtype=start_vertices_type), max_depth, - compress_result=compress_result, p=p, q=q, + random_state=random_state, workers=[w], allow_other_workers=False, ) @@ -181,7 +171,6 @@ def node2vec_random_walks( result_vertex_paths = [client.submit(op.getitem, f, 0) for f in result] result_edge_wgt_paths = [client.submit(op.getitem, f, 1) for f in result] - result_sizes = [client.submit(op.getitem, f, 2) for f in result] cudf_vertex_paths = [ client.submit(convert_to_cudf, cp_vertex_paths, input_graph.renumber_map, True) @@ -193,26 +182,17 @@ def node2vec_random_walks( for cp_edge_wgt_paths in result_edge_wgt_paths ] - cudf_sizes = [ - client.submit(convert_to_cudf, cp_sizes) - for cp_sizes in result_sizes - ] - - wait([cudf_vertex_paths, cudf_edge_wgt_paths, cudf_sizes]) + wait([cudf_vertex_paths, cudf_edge_wgt_paths]) - ddf_vertex_paths = dask_cudf.from_delayed(cudf_vertex_paths).persist() ddf_edge_wgt_paths = dask_cudf.from_delayed(cudf_edge_wgt_paths).persist() - ddf_sizes = dask_cudf.from_delayed(cudf_sizes).persist() - #wait([ddf_vertex_paths, ddf_edge_wgt_paths]) - # Wait until the inactive futures are released wait( [ (r.release(), c_v.release(), 
c_e.release()) - for r, c_v, c_e, c_s in zip(result, cudf_vertex_paths, cudf_edge_wgt_paths, cudf_sizes) + for r, c_v, c_e in zip(result, cudf_vertex_paths, cudf_edge_wgt_paths) ] ) - return ddf_vertex_paths, ddf_edge_wgt_paths, ddf_sizes + return ddf_vertex_paths, ddf_edge_wgt_paths From 6984645690f8ee7ddfa4620bf2c8b3f75d90d8ee Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 27 Dec 2024 21:26:33 -0800 Subject: [PATCH 10/60] support optional weights --- .../pylibcugraph/node2vec_random_walks.pyx | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx index fc391a5fec..d1c4760d1f 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx @@ -59,7 +59,7 @@ from pylibcugraph.utils cimport ( ) -def node2vec(ResourceHandle resource_handle, +def node2vec_random_walks(ResourceHandle resource_handle, _GPUGraph graph, seed_array, size_t max_depth, @@ -172,11 +172,14 @@ def node2vec(ResourceHandle resource_handle, # arrays for returning. 
cdef cugraph_type_erased_device_array_view_t* paths_ptr = \ cugraph_random_walk_result_get_paths(result_ptr) - cdef cugraph_type_erased_device_array_view_t* weights_ptr = \ - cugraph_random_walk_result_get_weights(result_ptr) + + if graph.weights_view_ptr is NULL and graph.weights_view_ptr_ptr is NULL: + cupy_weights = None + else: + weights_ptr = cugraph_random_walk_result_get_weights(result_ptr) + cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weights_ptr) cupy_paths = copy_to_cupy_array(c_resource_handle_ptr, paths_ptr) - cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weights_ptr) cugraph_random_walk_result_free(result_ptr) cugraph_type_erased_device_array_view_free(seed_view_ptr) From d04588aeb7962c6ca05ce54fddff04c59a9ece51 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 30 Dec 2024 19:53:05 -0800 Subject: [PATCH 11/60] update docstring and deprecate arguments --- .../cugraph/cugraph/sampling/random_walks.py | 44 +++++++++++++++++-- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index 1c56dbbe32..d4f8c3c53d 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -65,7 +65,7 @@ def random_walks( with 0.0s (when 'legacy_result_type' is 'True'). If 'legacy_result_type' is 'False', 'random_walks' returns padded results (vertex_paths, edge_weight_paths) but instead of 'sizes = None', returns the 'max_path_lengths'. - When 'legacy_result_type' is 'False', the arhument 'use_padding' is ignored. + When 'legacy_result_type' is 'False', the argument 'use_padding' is ignored. parameters ---------- @@ -81,6 +81,8 @@ def random_walks( Type of random walks: 'uniform', 'biased', 'node2vec'. 
Only 'uniform' random walks is currently supported + Deprecated + start_vertices : int or list or cudf.Series or cudf.DataFrame A single node or a list or a cudf.Series of nodes from which to run the random walks. In case of multi-column vertices it should be @@ -126,13 +128,39 @@ def random_walks( """ - if legacy_result_type: + warning_msg = ( + "random_walks is deprecated and will be removed " + "in the next release in favor of uniform_random_walks" + ) + warnings.warn(warning_msg, FutureWarning) + + # FIXME: Coalesced path results have been deprecated and should no longer be + # supported in 25.02. + # Context for legacy_result_type: The initial implementation of random_walks + # returned results where the vertex and weight path are proportional to the + # number of vertices instead of the number of edges hence the flag + # 'legacy_result_type' was created. This flag should be removed in favor of + # returning result paths proportional to the number of edges. Furthermore, + # Coalesced path results should also be removed in favor of always returning + # padded results. The flags 'legacy_result_type' and 'use_padding' should be + # removed. + + if legacy_result_type or use_padding is False: warning_msg = ( "Coalesced path results, returned when setting legacy_result_type=True, " "is deprecated and will no longer be supported in the next releases. " "only padded paths will be returned instead" ) warnings.warn(warning_msg, PendingDeprecationWarning) + + if random_walks_type != "uniform": + warning_msg = ( + "random_walks_type is deprecated and will be removed " + "in the next release. If random_walks_type == 'biased' or 'node2vec, " + "call 'biased_random_walks' or 'node2vec_random_walks'." + ) + warnings.warn(warning_msg, FutureWarning) + if max_depth is None: raise TypeError("must specify a 'max_depth'") @@ -142,6 +170,9 @@ def random_walks( # data struct like a dictionary, etc.).
The 2nd value is ignored here, # which is typically named isNx and used to convert the return type. # Consider a different return type if Nx types are passed in. + # The new API for random walk should instead always return the triple + # (vertex_paths, edge_wgt_paths, max_path_length) + G, _ = ensure_cugraph_obj_for_nx(G) if isinstance(start_vertices, int): @@ -191,7 +222,7 @@ def random_walks( ) warnings.warn(warning_msg, PendingDeprecationWarning) - # Drop the last vertex and and edge weight from each vertex and edge weight + # Drop the last vertex and edge weight from each vertex and edge weight # paths. vertex_paths = vertex_paths.drop( index=vertex_paths[max_depth :: max_depth + 1].index @@ -202,11 +233,16 @@ def random_walks( ).reset_index(drop=True) if use_padding: + # When padding, the 'sizes' array is not necessary because + # 'vertex_paths' and 'edge_wgt_paths' contain all information + # because of the padding factor. sizes = None # FIXME: Is it necessary to slice it with 'edge_wgt_paths_sz'? return vertex_paths, edge_wgt_paths, sizes # If 'use_padding' is False, compute the sizes of the unpadded results + # since the padded value (-1) will be removed which will make it difficult + # to identify the end and the beginning of a new path. sizes = ( vertex_paths.apply(lambda x: 1 if x != -1 else 0) @@ -251,7 +287,7 @@ def rw_path( Returns ------- path_data : cudf.DataFrame - Dataframe containing vetex path offsets, edge weight offsets and + Dataframe containing vertex path offsets, edge weight offsets and edge weight sizes for each path. 
""" From cb6a29414b79ea413debb4d02f1e14b599da40f0 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 30 Dec 2024 19:54:47 -0800 Subject: [PATCH 12/60] add new API for uniform_random_walks --- .../cugraph/sampling/uniform_random_walks.py | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 python/cugraph/cugraph/sampling/uniform_random_walks.py diff --git a/python/cugraph/cugraph/sampling/uniform_random_walks.py b/python/cugraph/cugraph/sampling/uniform_random_walks.py new file mode 100644 index 0000000000..72484240db --- /dev/null +++ b/python/cugraph/cugraph/sampling/uniform_random_walks.py @@ -0,0 +1,166 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cudf +import cupy as cp +from pylibcugraph import ResourceHandle +from pylibcugraph import ( + uniform_random_walks as pylibcugraph_uniform_random_walks, +) + +from cugraph.structure import Graph + +import warnings +from typing import Union, Tuple + + +def uniform_random_walks( + G: Graph, + start_vertices: Union[int, list, cudf.Series, cudf.DataFrame] = None, + max_depth: int = None, +) -> Tuple[cudf.Series, cudf.Series, Union[None, int, cudf.Series]]: + """ + Compute uniform random walks for each nodes in 'start_vertices'. + Vertices with no outgoing edges will be padded with -1 and the corresponding + edge weights with 0.0. + + parameters + ---------- + G : cuGraph.Graph + The graph can be either directed or undirected. 
+ + start_vertices : int or list or cudf.Series or cudf.DataFrame + A single node or a list or a cudf.Series of nodes from which to run + the random walks. In case of multi-column vertices it should be + a cudf.DataFrame + + max_depth : int + The maximum depth of the random walks + + The max depth is relative to the number of edges hence the vertex_paths size + is max_depth + 1. For instance, a 'max_depth' of 2 with only one seed will + result in a vertex_path of size 3. + + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + and + + max_path_length : int + The maximum path length. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> M = karate.get_edgelist(download=True) + >>> G = karate.get_graph() + >>> start_vertices = G.nodes()[:4] + >>> _, _, _ = cugraph.uniform_random_walks(G, start_vertices, 3) + + """ + + if max_depth is None: + raise TypeError("must specify a 'max_depth'") + + if isinstance(start_vertices, int): + start_vertices = [start_vertices] + + if isinstance(start_vertices, list): + # Ensure the 'start_vertices' have the same dtype as the edge list. + # Failing to do that may produce erroneous results. 
+ vertex_dtype = G.edgelist.edgelist_df.dtypes.iloc[0] + start_vertices = cudf.Series(start_vertices, dtype=vertex_dtype) + + if G.renumbered is True: + if isinstance(start_vertices, cudf.DataFrame): + start_vertices = G.lookup_internal_vertex_id( + start_vertices, start_vertices.columns + ) + else: + start_vertices = G.lookup_internal_vertex_id(start_vertices) + + vertex_paths, edge_wgt_paths, max_path_length = pylibcugraph_uniform_random_walks( + resource_handle=ResourceHandle(), + input_graph=G._plc_graph, + start_vertices=start_vertices, + max_length=max_depth, + ) + + vertex_paths = cudf.Series(vertex_paths) + + if G.renumbered: + df_ = cudf.DataFrame() + df_["vertex_paths"] = vertex_paths + df_ = G.unrenumber(df_, "vertex_paths", preserve_order=True) + vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) + + edge_wgt_paths = cudf.Series(edge_wgt_paths) + + return ( + vertex_paths, + edge_wgt_paths, + max_path_length, + ) + + +def rw_path( + num_paths: int, sizes: cudf.Series +) -> Tuple[cudf.Series, cudf.Series, cudf.Series]: + """ + Retrieve more information on the obtained paths in case use_padding + is False. + + parameters + ---------- + num_paths: int + Number of paths in the random walk output. + + sizes: cudf.Series + Path size returned in random walk output. + + Returns + ------- + path_data : cudf.DataFrame + Dataframe containing vertex path offsets, edge weight offsets and + edge weight sizes for each path. + """ + + vertex_offsets = cudf.Series(0, dtype=sizes.dtype) + vertex_offsets = cudf.concat( + [vertex_offsets, sizes.cumsum()[:-1]], ignore_index=True + ) + weight_sizes = sizes - 1 + + weight_offsets = cudf.Series(0, dtype=sizes.dtype) + num_edges = vertex_offsets.diff()[1:] - 1 + + weight_offsets = cudf.concat( + [weight_offsets, num_edges.cumsum()], ignore_index=True + ) + # FIXME: CUDF bug. 
concatenating two series of type int32 but get a CUDF of + # type 'int64' have to cast the results + weight_offsets = weight_offsets.astype(sizes.dtype) + + path_data = cudf.DataFrame() + path_data["vertex_offsets"] = vertex_offsets + path_data["weight_sizes"] = weight_sizes + path_data["weight_offsets"] = weight_offsets + + return path_data[:num_paths] From 793602628d21e4362c827b100ae85cc0f566bbe7 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 30 Dec 2024 20:02:30 -0800 Subject: [PATCH 13/60] deprecate method --- python/cugraph/cugraph/sampling/__init__.py | 1 + python/cugraph/cugraph/sampling/random_walks.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/python/cugraph/cugraph/sampling/__init__.py b/python/cugraph/cugraph/sampling/__init__.py index de5c43bdd0..3af0b84a6e 100644 --- a/python/cugraph/cugraph/sampling/__init__.py +++ b/python/cugraph/cugraph/sampling/__init__.py @@ -12,5 +12,6 @@ # limitations under the License. from cugraph.sampling.random_walks import random_walks, rw_path +from cugraph.sampling.uniform_random_walks import uniform_random_walks from cugraph.sampling.node2vec import node2vec from cugraph.sampling.uniform_neighbor_sample import uniform_neighbor_sample diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index d4f8c3c53d..f0177d92c4 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -291,6 +291,13 @@ def rw_path( edge weight sizes for each path. """ + warning_msg = ( + "This method is deprecated in favor of always returning " + "padded results." 
+ ) + + warnings.warn(warning_msg, PendingDeprecationWarning) + vertex_offsets = cudf.Series(0, dtype=sizes.dtype) vertex_offsets = cudf.concat( [vertex_offsets, sizes.cumsum()[:-1]], ignore_index=True From 7a5056fcd4c5f5da3a9b356658ec9591996692e9 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 30 Dec 2024 20:03:25 -0800 Subject: [PATCH 14/60] update copyrights --- python/cugraph/cugraph/sampling/__init__.py | 2 +- python/cugraph/cugraph/sampling/uniform_random_walks.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/sampling/__init__.py b/python/cugraph/cugraph/sampling/__init__.py index 3af0b84a6e..723457d62c 100644 --- a/python/cugraph/cugraph/sampling/__init__.py +++ b/python/cugraph/cugraph/sampling/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/cugraph/sampling/uniform_random_walks.py b/python/cugraph/cugraph/sampling/uniform_random_walks.py index 72484240db..36ea7eeeec 100644 --- a/python/cugraph/cugraph/sampling/uniform_random_walks.py +++ b/python/cugraph/cugraph/sampling/uniform_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at From e2e4694f2c6ec62e63bb141738107b62f5e46f53 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 30 Dec 2024 20:04:50 -0800 Subject: [PATCH 15/60] add uniform random walks --- python/cugraph/cugraph/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index ada1fec74c..3f960433ca 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -108,6 +108,7 @@ from cugraph.sampling import ( random_walks, + uniform_random_walks, rw_path, node2vec, uniform_neighbor_sample, From 877265b1f258dd0e39ecfe165339ab56d77b0c09 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 30 Dec 2024 20:24:03 -0800 Subject: [PATCH 16/60] add new API for node2vec random walks --- .../cugraph/sampling/node2vec_random_walks.py | 160 ++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 python/cugraph/cugraph/sampling/node2vec_random_walks.py diff --git a/python/cugraph/cugraph/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/sampling/node2vec_random_walks.py new file mode 100644 index 0000000000..05c256c63a --- /dev/null +++ b/python/cugraph/cugraph/sampling/node2vec_random_walks.py @@ -0,0 +1,160 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from pylibcugraph import ( + ResourceHandle, + node2vec as pylibcugraph_node2vec_random_walks, +) +import warnings + +import cudf + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, start_vertices): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes.iloc[0] + if isinstance(start_vertices, cudf.Series): + start_vertices_dtype = start_vertices.dtype + else: + start_vertices_dtype = start_vertices.dtypes.iloc[0] + + if start_vertices_dtype != vertex_dtype: + warning_msg = ( + "Node2vec requires 'start_vertices' to match the graph's " + f"'vertex' type. input graph's vertex type is: {vertex_dtype} and got " + f"'start_vertices' of type: {start_vertices_dtype}." + ) + warnings.warn(warning_msg, UserWarning) + start_vertices = start_vertices.astype(vertex_dtype) + + return start_vertices + + +def node2vec(G, start_vertices, max_depth=1, p=1.0, q=1.0, random_state=None): + """ + Computes random walks for each node in 'start_vertices', under the + node2vec sampling framework. + + References + ---------- + + A Grover, J Leskovec: node2vec: Scalable Feature Learning for Networks, + Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge + Discovery and Data Mining, https://arxiv.org/abs/1607.00653 + + Parameters + ---------- + G : cuGraph.Graph + The graph can be either directed or undirected. + Weights in the graph are ignored. + + start_vertices: int or list or cudf.Series or cudf.DataFrame + A single node or a list or a cudf.Series of nodes from which to run + the random walks. In case of multi-column vertices it should be + a cudf.DataFrame. Only supports int32 currently. + + max_depth: int, optional (default=1) + The maximum depth of the random walks. If not specified, the maximum + depth is set to 1. + + p: float, optional (default=1.0, [0 < p]) + Return factor, which represents the likelihood of backtracking to + a previous node in the walk. 
A higher value makes it less likely to + sample a previously visited node, while a lower value makes it more + likely to backtrack, making the walk "local". A positive float. + + q: float, optional (default=1.0, [0 < q]) + In-out factor, which represents the likelihood of visiting nodes + closer or further from the outgoing node. If q > 1, the random walk + is likelier to visit nodes closer to the outgoing node. If q < 1, the + random walk is likelier to visit nodes further from the outgoing node. + A positive float. + + random_state: int, optional + Random seed to use when making sampling calls. + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + and + + max_path_length : int + The maximum path length. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> G = karate.get_graph(download=True) + >>> start_vertices = cudf.Series([0, 2], dtype=np.int32) + >>> paths, weights, max_length = cugraph.node2vec_random_walks(G, + ... start_vertices, 3, + ... 
0.8, 0.5) + + """ + if (not isinstance(max_depth, int)) or (max_depth < 1): + raise ValueError( + f"'max_depth' must be a positive integer, " f"got: {max_depth}" + ) + if (not isinstance(p, float)) or (p <= 0.0): + raise ValueError(f"'p' must be a positive float, got: {p}") + if (not isinstance(q, float)) or (q <= 0.0): + raise ValueError(f"'q' must be a positive float, got: {q}") + + + if isinstance(start_vertices, int): + start_vertices = [start_vertices] + + if isinstance(start_vertices, list): + start_vertices = cudf.Series(start_vertices, dtype="int32") + # FIXME: Verify if this condition still holds + if start_vertices.dtype != "int32": + raise ValueError( + f"'start_vertices' must have int32 values, " + f"got: {start_vertices.dtype}" + ) + + if G.renumbered is True: + if isinstance(start_vertices, cudf.DataFrame): + start_vertices = G.lookup_internal_vertex_id( + start_vertices, start_vertices.columns + ) + else: + start_vertices = G.lookup_internal_vertex_id(start_vertices) + + start_vertices = ensure_valid_dtype(G, start_vertices) + + vertex_set, edge_set = pylibcugraph_node2vec_random_walks( + resource_handle=ResourceHandle(), + graph=G._plc_graph, + seed_array=start_vertices, + max_depth=max_depth, + p=p, + q=q, + random_state=random_state + ) + vertex_set = cudf.Series(vertex_set) + edge_set = cudf.Series(edge_set) + + if G.renumbered: + df_ = cudf.DataFrame() + df_["vertex_set"] = vertex_set + df_ = G.unrenumber(df_, "vertex_set", preserve_order=True) + vertex_set = cudf.Series(df_["vertex_set"]) + return vertex_set, edge_set, max_depth From bb772377c5a0acb01399c75cd2f387de9e290b82 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 30 Dec 2024 20:25:18 -0800 Subject: [PATCH 17/60] deprecate legacy implementation --- python/cugraph/cugraph/sampling/node2vec.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index eb91bfec82..b5c93873fd 100644 --- 
a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -112,6 +112,12 @@ def node2vec(G, start_vertices, max_depth=1, compress_result=True, p=1.0, q=1.0) ... True, 0.8, 0.5) """ + warning_msg = ( + "node2vec is deprecated and will be removed " + "in the next release in favor of node2vec_random_walks" + ) + warnings.warn(warning_msg, FutureWarning) + if (not isinstance(max_depth, int)) or (max_depth < 1): raise ValueError( f"'max_depth' must be a positive integer, " f"got: {max_depth}" From 618fe76650c93cb958eaa5c2a6f1f1638ac808a2 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 30 Dec 2024 20:30:39 -0800 Subject: [PATCH 18/60] add random state argument and update copyright --- python/cugraph/cugraph/sampling/node2vec_random_walks.py | 2 +- python/cugraph/cugraph/sampling/uniform_random_walks.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/sampling/node2vec_random_walks.py index 05c256c63a..530ce6ebd3 100644 --- a/python/cugraph/cugraph/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/sampling/node2vec_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/cugraph/sampling/uniform_random_walks.py b/python/cugraph/cugraph/sampling/uniform_random_walks.py index 36ea7eeeec..0efef29bbd 100644 --- a/python/cugraph/cugraph/sampling/uniform_random_walks.py +++ b/python/cugraph/cugraph/sampling/uniform_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION.
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -20,7 +20,6 @@ from cugraph.structure import Graph -import warnings from typing import Union, Tuple @@ -28,6 +27,7 @@ def uniform_random_walks( G: Graph, start_vertices: Union[int, list, cudf.Series, cudf.DataFrame] = None, max_depth: int = None, + random_state: int = None, ) -> Tuple[cudf.Series, cudf.Series, Union[None, int, cudf.Series]]: """ Compute uniform random walks for each nodes in 'start_vertices'. @@ -50,6 +50,9 @@ def uniform_random_walks( The max depth is relative to the number of edges hence the vertex_paths size is max_depth + 1. For instance, a 'max_depth' of 2 with only one seed will result in a vertex_path of size 3. + + random_state: int, optional + Random seed to use when making sampling calls. Returns From a1d004c454db169a9a375dc4f43dfe1d2d059061 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 31 Dec 2024 15:42:31 -0800 Subject: [PATCH 19/60] update header file to take as input a random state --- cpp/include/cugraph_c/sampling_algorithms.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index ac029181ba..8c0040fff1 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -41,6 +41,7 @@ typedef struct { * @brief Compute uniform random walks * * @param [in] handle Handle for accessing resources + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] graph Pointer to graph. 
NOTE: Graph might be modified if the storage * needs to be transposed * @param [in] start_vertices Array of source vertices @@ -52,6 +53,7 @@ typedef struct { */ cugraph_error_code_t cugraph_uniform_random_walks( const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start_vertices, size_t max_length, @@ -62,6 +64,7 @@ cugraph_error_code_t cugraph_uniform_random_walks( * @brief Compute biased random walks * * @param [in] handle Handle for accessing resources + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] graph Pointer to graph. NOTE: Graph might be modified if the storage * needs to be transposed * @param [in] start_vertices Array of source vertices @@ -73,6 +76,7 @@ cugraph_error_code_t cugraph_uniform_random_walks( */ cugraph_error_code_t cugraph_biased_random_walks( const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start_vertices, size_t max_length, @@ -83,7 +87,7 @@ cugraph_error_code_t cugraph_biased_random_walks( * @brief Compute random walks using the node2vec framework. * * @param [in] handle Handle for accessing resources - * @param [in,out] rng_state State of the random number generator, updated with each call + * @param [in,out] rng_state State of the random number generator, updated with each call * @param [in] graph Pointer to graph. 
NOTE: Graph might be modified if the storage * needs to be transposed * @param [in] start_vertices Array of source vertices From bea2a2f5d92cab3586fbc81a4b09a6840819a738 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 31 Dec 2024 15:46:08 -0800 Subject: [PATCH 20/60] add support for rng state as input --- cpp/src/c_api/random_walks.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/cpp/src/c_api/random_walks.cpp b/cpp/src/c_api/random_walks.cpp index e21090f6d0..c38db96baf 100644 --- a/cpp/src/c_api/random_walks.cpp +++ b/cpp/src/c_api/random_walks.cpp @@ -154,7 +154,6 @@ namespace { struct uniform_random_walks_functor : public cugraph::c_api::abstract_functor { raft::handle_t const& handle_; - // FIXME: rng_state_ should be passed as a parameter cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; cugraph::c_api::cugraph_graph_t* graph_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertices_{nullptr}; @@ -162,11 +161,13 @@ struct uniform_random_walks_functor : public cugraph::c_api::abstract_functor { cugraph::c_api::cugraph_random_walk_result_t* result_{nullptr}; uniform_random_walks_functor(cugraph_resource_handle_t const* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, cugraph_type_erased_device_array_view_t const* start_vertices, size_t max_length) : abstract_functor(), handle_(*reinterpret_cast(handle)->handle_), + rng_state_(reinterpret_cast(rng_state)), graph_(reinterpret_cast(graph)), start_vertices_( reinterpret_cast( @@ -224,10 +225,6 @@ struct uniform_random_walks_functor : public cugraph::c_api::abstract_functor { graph_view.local_vertex_partition_range_last(), false); - // FIXME: remove once rng_state passed as parameter - rng_state_ = reinterpret_cast( - new cugraph::c_api::cugraph_rng_state_t{raft::random::RngState{0}}); - auto [paths, weights] = cugraph::uniform_random_walks( handle_, rng_state_->rng_state_, @@ -261,7 +258,6 @@ struct 
uniform_random_walks_functor : public cugraph::c_api::abstract_functor { struct biased_random_walks_functor : public cugraph::c_api::abstract_functor { raft::handle_t const& handle_; - // FIXME: rng_state_ should be passed as a parameter cugraph::c_api::cugraph_rng_state_t* rng_state_{nullptr}; cugraph::c_api::cugraph_graph_t* graph_{nullptr}; cugraph::c_api::cugraph_type_erased_device_array_view_t const* start_vertices_{nullptr}; @@ -269,11 +265,13 @@ struct biased_random_walks_functor : public cugraph::c_api::abstract_functor { cugraph::c_api::cugraph_random_walk_result_t* result_{nullptr}; biased_random_walks_functor(cugraph_resource_handle_t const* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, cugraph_type_erased_device_array_view_t const* start_vertices, size_t max_length) : abstract_functor(), handle_(*reinterpret_cast(handle)->handle_), + rng_state_(reinterpret_cast(rng_state)), graph_(reinterpret_cast(graph)), start_vertices_( reinterpret_cast( @@ -293,8 +291,6 @@ struct biased_random_walks_functor : public cugraph::c_api::abstract_functor { // FIXME: Think about how to handle SG vice MG if constexpr (!cugraph::is_candidate::value) { unsupported(); - } else if constexpr (multi_gpu) { - unsupported(); } else { // random walks expects store_transposed == false if constexpr (store_transposed) { @@ -333,10 +329,6 @@ struct biased_random_walks_functor : public cugraph::c_api::abstract_functor { graph_view.local_vertex_partition_range_last(), false); - // FIXME: remove once rng_state passed as parameter - rng_state_ = reinterpret_cast( - new cugraph::c_api::cugraph_rng_state_t{raft::random::RngState{0}}); - auto [paths, weights] = cugraph::biased_random_walks( handle_, rng_state_->rng_state_, @@ -541,6 +533,7 @@ void cugraph_random_walk_result_free(cugraph_random_walk_result_t* result) cugraph_error_code_t cugraph_uniform_random_walks( const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, const 
cugraph_type_erased_device_array_view_t* start_vertices, size_t max_length, @@ -555,13 +548,14 @@ cugraph_error_code_t cugraph_uniform_random_walks( "vertex type of graph and start_vertices must match", *error); - uniform_random_walks_functor functor(handle, graph, start_vertices, max_length); + uniform_random_walks_functor functor(handle, rng_state, graph, start_vertices, max_length); return cugraph::c_api::run_algorithm(graph, functor, result, error); } cugraph_error_code_t cugraph_biased_random_walks( const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start_vertices, size_t max_length, @@ -576,7 +570,7 @@ cugraph_error_code_t cugraph_biased_random_walks( "vertex type of graph and start_vertices must match", *error); - biased_random_walks_functor functor(handle, graph, start_vertices, max_length); + biased_random_walks_functor functor(handle, rng_state, graph, start_vertices, max_length); return cugraph::c_api::run_algorithm(graph, functor, result, error); } From 755acc716b3bcb94ff538436ba34ff06073a635c Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 31 Dec 2024 15:51:58 -0800 Subject: [PATCH 21/60] update tests to support rng state as input --- cpp/tests/c_api/sg_random_walks_test.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/cpp/tests/c_api/sg_random_walks_test.c b/cpp/tests/c_api/sg_random_walks_test.c index 71c76f3f94..8d7328cba9 100644 --- a/cpp/tests/c_api/sg_random_walks_test.c +++ b/cpp/tests/c_api/sg_random_walks_test.c @@ -66,8 +66,12 @@ int generic_uniform_random_walks_test(vertex_t* h_src, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + ret_code = - 
cugraph_uniform_random_walks(handle, graph, d_start_view, max_depth, &result, &ret_error); + cugraph_uniform_random_walks(handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_random_walks failed."); @@ -188,9 +192,13 @@ int generic_biased_random_walks_test(vertex_t* h_src, handle, d_start_view, (byte_t*)h_start, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); - + + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + ret_code = - cugraph_biased_random_walks(handle, graph, d_start_view, max_depth, &result, &ret_error); + cugraph_biased_random_walks(handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "biased_random_walks failed."); From ef00fa5f83de00606beb24c1a99880f3536dc19f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 31 Dec 2024 16:13:37 -0800 Subject: [PATCH 22/60] add biased random walks to the PLC API --- .../pylibcugraph/pylibcugraph/CMakeLists.txt | 1 + python/pylibcugraph/pylibcugraph/__init__.py | 2 + .../pylibcugraph/_cugraph_c/algorithms.pxd | 2 + .../pylibcugraph/biased_random_walks.pyx | 150 ++++++++++++++++++ .../pylibcugraph/uniform_random_walks.pyx | 21 ++- 5 files changed, 174 insertions(+), 2 deletions(-) create mode 100644 python/pylibcugraph/pylibcugraph/biased_random_walks.pyx diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index 44963bdc5e..d453c62001 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ 
b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -58,6 +58,7 @@ set(cython_sources biased_neighbor_sample.pyx negative_sampling.pyx uniform_random_walks.pyx + biased_random_walks.pyx utils.pyx weakly_connected_components.pyx replicate_edgelist.pyx diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index cd5b23db1a..92c6459686 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -76,6 +76,8 @@ from pylibcugraph.uniform_random_walks import uniform_random_walks +from pylibcugraph.biased_random_walks import biased_random_walks + from pylibcugraph.betweenness_centrality import betweenness_centrality from pylibcugraph.induced_subgraph import induced_subgraph diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index a8e5bb7fdc..e043b7672b 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -363,6 +363,7 @@ cdef extern from "cugraph_c/algorithms.h": cdef cugraph_error_code_t \ cugraph_uniform_random_walks( const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start_vertices, size_t max_length, @@ -374,6 +375,7 @@ cdef extern from "cugraph_c/algorithms.h": cdef cugraph_error_code_t \ cugraph_biased_random_walks( const cugraph_resource_handle_t* handle, + cugraph_rng_state_t* rng_state, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* start_vertices, size_t max_length, diff --git a/python/pylibcugraph/pylibcugraph/biased_random_walks.pyx b/python/pylibcugraph/pylibcugraph/biased_random_walks.pyx new file mode 100644 index 0000000000..2f37de7e93 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/biased_random_walks.pyx @@ -0,0 +1,150 @@ +# Copyright (c) 2022-2023, NVIDIA 
CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t + +from pylibcugraph._cugraph_c.resource_handle cimport ( + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_create, + cugraph_type_erased_device_array_view_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.algorithms cimport ( + cugraph_biased_random_walks, + cugraph_random_walk_result_t, + cugraph_random_walk_result_get_paths, + cugraph_random_walk_result_get_weights, + cugraph_random_walk_result_get_max_path_length, + cugraph_random_walk_result_free, +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph._cugraph_c.random cimport ( + cugraph_rng_state_t +) +from pylibcugraph.random cimport ( + CuGraphRandomState +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + assert_CAI_type, + get_c_type_from_numpy_type, +) + + +def biased_random_walks(ResourceHandle resource_handle, + _GPUGraph input_graph, + start_vertices, + size_t max_length, + random_state=None): + """ + Compute biased random walks for each nodes in 'start_vertices' + + Parameters 
+ ---------- + resource_handle: ResourceHandle + Handle to the underlying device and host resources needed for + referencing data and running algorithms. + + input_graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + start_vertices: device array type + Device array containing the list of starting vertices from which + to run the biased random walk + + max_length: size_t + The maximum depth of the biased random walks + + random_state: int (Optional) + Random state to use when generating samples. Optional argument, + defaults to a hash of process id, time, and hostname. + (See pylibcugraph.random.CuGraphRandomState) + + Returns + ------- + A tuple containing two device arrays and a size_t which are respectively + the vertices path, the edge path weights and the maximum path length + + """ + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = input_graph.c_graph_ptr + + assert_CAI_type(start_vertices, "start_vertices") + + cdef cugraph_random_walk_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef uintptr_t cai_start_ptr = \ + start_vertices.__cuda_array_interface__["data"][0] + + cdef cugraph_type_erased_device_array_view_t* weights_ptr + + cdef cugraph_type_erased_device_array_view_t* start_ptr = \ + cugraph_type_erased_device_array_view_create( + cai_start_ptr, + len(start_vertices), + get_c_type_from_numpy_type(start_vertices.dtype)) + + cg_rng_state = CuGraphRandomState(resource_handle, random_state) + + cdef cugraph_rng_state_t* rng_state_ptr = \ + cg_rng_state.rng_state_ptr + + error_code = cugraph_biased_random_walks( + c_resource_handle_ptr, + rng_state_ptr, + c_graph_ptr, + start_ptr, + max_length, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_biased_random_walks") + + cdef cugraph_type_erased_device_array_view_t* path_ptr = \ + 
cugraph_random_walk_result_get_paths(result_ptr) + + if input_graph.weights_view_ptr is NULL and input_graph.weights_view_ptr_ptr is NULL: + cupy_weights = None + else: + weights_ptr = cugraph_random_walk_result_get_weights(result_ptr) + cupy_weights = copy_to_cupy_array(c_resource_handle_ptr, weights_ptr) + + max_path_length = \ + cugraph_random_walk_result_get_max_path_length(result_ptr) + + cupy_paths = copy_to_cupy_array(c_resource_handle_ptr, path_ptr) + + cugraph_random_walk_result_free(result_ptr) + cugraph_type_erased_device_array_view_free(start_ptr) + + return (cupy_paths, cupy_weights, max_path_length) diff --git a/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx b/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx index 677695f93a..95379254e4 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx @@ -45,6 +45,12 @@ from pylibcugraph.resource_handle cimport ( from pylibcugraph.graphs cimport ( _GPUGraph, ) +from pylibcugraph._cugraph_c.random cimport ( + cugraph_rng_state_t +) +from pylibcugraph.random cimport ( + CuGraphRandomState +) from pylibcugraph.utils cimport ( assert_success, copy_to_cupy_array, @@ -56,7 +62,8 @@ from pylibcugraph.utils cimport ( def uniform_random_walks(ResourceHandle resource_handle, _GPUGraph input_graph, start_vertices, - size_t max_length): + size_t max_length, + random_state=None): """ Compute uniform random walks for each nodes in 'start_vertices' @@ -75,7 +82,11 @@ def uniform_random_walks(ResourceHandle resource_handle, max_length: size_t The maximum depth of the uniform random walks - + + random_state: int (Optional) + Random state to use when generating samples. Optional argument, + defaults to a hash of process id, time, and hostname. 
+ (See pylibcugraph.random.CuGraphRandomState) Returns ------- @@ -103,9 +114,15 @@ def uniform_random_walks(ResourceHandle resource_handle, cai_start_ptr, len(start_vertices), get_c_type_from_numpy_type(start_vertices.dtype)) + + cg_rng_state = CuGraphRandomState(resource_handle, random_state) + + cdef cugraph_rng_state_t* rng_state_ptr = \ + cg_rng_state.rng_state_ptr error_code = cugraph_uniform_random_walks( c_resource_handle_ptr, + rng_state_ptr, c_graph_ptr, start_ptr, max_length, From ae4833ca728b03a8959537722271407722fee97d Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 31 Dec 2024 16:34:22 -0800 Subject: [PATCH 23/60] add biased random walks to the python API --- python/cugraph/cugraph/__init__.py | 1 + python/cugraph/cugraph/sampling/__init__.py | 1 + .../cugraph/sampling/biased_random_walks.py | 124 ++++++++++++++++++ .../cugraph/sampling/uniform_random_walks.py | 47 +------ 4 files changed, 127 insertions(+), 46 deletions(-) create mode 100644 python/cugraph/cugraph/sampling/biased_random_walks.py diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index 3f960433ca..da26971cee 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -109,6 +109,7 @@ from cugraph.sampling import ( random_walks, uniform_random_walks, + biased_random_walks, rw_path, node2vec, uniform_neighbor_sample, diff --git a/python/cugraph/cugraph/sampling/__init__.py b/python/cugraph/cugraph/sampling/__init__.py index 723457d62c..16d68a9710 100644 --- a/python/cugraph/cugraph/sampling/__init__.py +++ b/python/cugraph/cugraph/sampling/__init__.py @@ -13,5 +13,6 @@ from cugraph.sampling.random_walks import random_walks, rw_path from cugraph.sampling.uniform_random_walks import uniform_random_walks +from cugraph.sampling.biased_random_walks import biased_random_walks from cugraph.sampling.node2vec import node2vec from cugraph.sampling.uniform_neighbor_sample import uniform_neighbor_sample diff --git 
a/python/cugraph/cugraph/sampling/biased_random_walks.py b/python/cugraph/cugraph/sampling/biased_random_walks.py new file mode 100644 index 0000000000..65b6524197 --- /dev/null +++ b/python/cugraph/cugraph/sampling/biased_random_walks.py @@ -0,0 +1,124 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import cudf +import cupy as cp +from pylibcugraph import ResourceHandle +from pylibcugraph import ( + biased_random_walks as pylibcugraph_biased_random_walks, +) + +from cugraph.structure import Graph + +from typing import Union, Tuple + + +def biased_random_walks( + G: Graph, + start_vertices: Union[int, list, cudf.Series, cudf.DataFrame] = None, + max_depth: int = None, + random_state: int = None, +) -> Tuple[cudf.Series, cudf.Series, Union[None, int, cudf.Series]]: + """ + Compute biased random walks for each nodes in 'start_vertices'. + Vertices with no outgoing edges will be padded with -1 and the corresponding + edge weights with 0.0. + + parameters + ---------- + G : cuGraph.Graph + The graph can be either directed or undirected. + + start_vertices : int or list or cudf.Series or cudf.DataFrame + A single node or a list or a cudf.Series of nodes from which to run + the random walks. In case of multi-column vertices it should be + a cudf.DataFrame + + max_depth : int + The maximum depth of the random walks + + The max depth is relative to the number of edges hence the vertex_paths size + is max_depth + 1. 
For instance, a 'max_depth' of 2 with only one seed will + result in a vertex_path of size 3. + + random_state: int, optional + Random seed to use when making sampling calls. + + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + and + + max_path_length : int + The maximum path length. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> M = karate.get_edgelist(download=True) + >>> G = karate.get_graph() + >>> start_vertices = G.nodes()[:4] + >>> _, _, _ = cugraph.biased_random_walks(G, start_vertices, 3) + + """ + + if max_depth is None: + raise TypeError("must specify a 'max_depth'") + + if isinstance(start_vertices, int): + start_vertices = [start_vertices] + + if isinstance(start_vertices, list): + # Ensure the 'start_vertices' have the same dtype as the edge list. + # Failing to do that may produce erroneous results. 
+ vertex_dtype = G.edgelist.edgelist_df.dtypes.iloc[0] + start_vertices = cudf.Series(start_vertices, dtype=vertex_dtype) + + if G.renumbered is True: + if isinstance(start_vertices, cudf.DataFrame): + start_vertices = G.lookup_internal_vertex_id( + start_vertices, start_vertices.columns + ) + else: + start_vertices = G.lookup_internal_vertex_id(start_vertices) + + vertex_paths, edge_wgt_paths, max_path_length = pylibcugraph_biased_random_walks( + resource_handle=ResourceHandle(), + input_graph=G._plc_graph, + start_vertices=start_vertices, + max_length=max_depth, + random_state=random_state + ) + + vertex_paths = cudf.Series(vertex_paths) + + if G.renumbered: + df_ = cudf.DataFrame() + df_["vertex_paths"] = vertex_paths + df_ = G.unrenumber(df_, "vertex_paths", preserve_order=True) + vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) + + edge_wgt_paths = cudf.Series(edge_wgt_paths) + + return ( + vertex_paths, + edge_wgt_paths, + max_path_length, + ) diff --git a/python/cugraph/cugraph/sampling/uniform_random_walks.py b/python/cugraph/cugraph/sampling/uniform_random_walks.py index 0efef29bbd..160bdc9c9f 100644 --- a/python/cugraph/cugraph/sampling/uniform_random_walks.py +++ b/python/cugraph/cugraph/sampling/uniform_random_walks.py @@ -104,6 +104,7 @@ def uniform_random_walks( input_graph=G._plc_graph, start_vertices=start_vertices, max_length=max_depth, + random_state=random_state ) vertex_paths = cudf.Series(vertex_paths) @@ -121,49 +122,3 @@ def uniform_random_walks( edge_wgt_paths, max_path_length, ) - - -def rw_path( - num_paths: int, sizes: cudf.Series -) -> Tuple[cudf.Series, cudf.Series, cudf.Series]: - """ - Retrieve more information on the obtained paths in case use_padding - is False. - - parameters - ---------- - num_paths: int - Number of paths in the random walk output. - - sizes: cudf.Series - Path size returned in random walk output. 
- - Returns - ------- - path_data : cudf.DataFrame - Dataframe containing vertex path offsets, edge weight offsets and - edge weight sizes for each path. - """ - - vertex_offsets = cudf.Series(0, dtype=sizes.dtype) - vertex_offsets = cudf.concat( - [vertex_offsets, sizes.cumsum()[:-1]], ignore_index=True - ) - weight_sizes = sizes - 1 - - weight_offsets = cudf.Series(0, dtype=sizes.dtype) - num_edges = vertex_offsets.diff()[1:] - 1 - - weight_offsets = cudf.concat( - [weight_offsets, num_edges.cumsum()], ignore_index=True - ) - # FIXME: CUDF bug. concatenating two series of type int32 but get a CUDF of - # type 'int64' have to cast the results - weight_offsets = weight_offsets.astype(sizes.dtype) - - path_data = cudf.DataFrame() - path_data["vertex_offsets"] = vertex_offsets - path_data["weight_sizes"] = weight_sizes - path_data["weight_offsets"] = weight_offsets - - return path_data[:num_paths] From 8314291b37b31e0479db4478c34825361cf49ba8 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 31 Dec 2024 17:32:24 -0800 Subject: [PATCH 24/60] update docstrings and init file --- python/cugraph/cugraph/__init__.py | 1 + python/cugraph/cugraph/dask/__init__.py | 2 ++ .../cugraph/dask/sampling/node2vec_random_walks.py | 12 +++++++++--- python/cugraph/cugraph/sampling/__init__.py | 1 + 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index da26971cee..4f2c47cf41 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -110,6 +110,7 @@ random_walks, uniform_random_walks, biased_random_walks, + node2vec_random_walks, rw_path, node2vec, uniform_neighbor_sample, diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index b8753fc461..3b63e0ebff 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -28,6 +28,8 @@ from .components.connectivity import 
weakly_connected_components from .sampling.uniform_neighbor_sample import uniform_neighbor_sample from .sampling.random_walks import random_walks +from .sampling.uniform_random_walks import uniform_random_walks +from .sampling.biased_random_walks import biased_random_walks from .sampling.node2vec_random_walks import node2vec_random_walks from .centrality.eigenvector_centrality import eigenvector_centrality from .cores.core_number import core_number diff --git a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py index 18171eda62..3e42880a7d 100644 --- a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py @@ -71,8 +71,9 @@ def node2vec_random_walks( random_state=None ): """ - Computes random walks for each node in 'start_vertices', under the - node2vec sampling framework. + compute random walks under the node2vec sampling framework for each nodes in + 'start_vertices' and returns a padded result along with the maximum path length. + Vertices with no outgoing edges will be padded with -1. parameters ---------- @@ -112,6 +113,11 @@ def node2vec_random_walks( edge_weight_paths: dask_cudf.Series Series containing the edge weights of edges represented by the returned vertex_paths + + and + + max_path_length : int + The maximum path length. 
""" client = default_client() @@ -194,5 +200,5 @@ def node2vec_random_walks( ] ) - return ddf_vertex_paths, ddf_edge_wgt_paths + return ddf_vertex_paths, ddf_edge_wgt_paths, max_depth diff --git a/python/cugraph/cugraph/sampling/__init__.py b/python/cugraph/cugraph/sampling/__init__.py index 16d68a9710..495483d135 100644 --- a/python/cugraph/cugraph/sampling/__init__.py +++ b/python/cugraph/cugraph/sampling/__init__.py @@ -14,5 +14,6 @@ from cugraph.sampling.random_walks import random_walks, rw_path from cugraph.sampling.uniform_random_walks import uniform_random_walks from cugraph.sampling.biased_random_walks import biased_random_walks +from cugraph.sampling.node2vec_random_walks import node2vec_random_walks from cugraph.sampling.node2vec import node2vec from cugraph.sampling.uniform_neighbor_sample import uniform_neighbor_sample From 1603bcd7747a306a5a7dcadc8f2068e535f7eeb3 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 31 Dec 2024 17:34:00 -0800 Subject: [PATCH 25/60] fix typo --- python/cugraph/cugraph/sampling/node2vec_random_walks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/sampling/node2vec_random_walks.py index 530ce6ebd3..3f8a6bdb66 100644 --- a/python/cugraph/cugraph/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/sampling/node2vec_random_walks.py @@ -13,7 +13,7 @@ from pylibcugraph import ( ResourceHandle, - node2vec as pylibcugraph_node2vec_random_walks, + node2vec_random_walks as pylibcugraph_node2vec_random_walks, ) import warnings @@ -41,7 +41,7 @@ def ensure_valid_dtype(input_graph, start_vertices): return start_vertices -def node2vec(G, start_vertices, max_depth=1, p=1.0, q=1.0, random_state=None): +def node2vec_random_walks(G, start_vertices, max_depth=1, p=1.0, q=1.0, random_state=None): """ Computes random walks for each node in 'start_vertices', under the node2vec sampling framework. 
From 0a03b290542538dd4ab1c02832404f05a4498935 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 31 Dec 2024 17:36:29 -0800 Subject: [PATCH 26/60] update copyright --- python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py index 3e42880a7d..2ba3a2d238 100644 --- a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From 88e405df171c748da7973ec2ae0b2f77aad55c2f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 31 Dec 2024 17:39:11 -0800 Subject: [PATCH 27/60] add mg implementation of biased and uniform random walks --- .../dask/sampling/biased_random_walks.py | 171 ++++++++++++++++++ .../dask/sampling/uniform_random_walks.py | 171 ++++++++++++++++++ 2 files changed, 342 insertions(+) create mode 100644 python/cugraph/cugraph/dask/sampling/biased_random_walks.py create mode 100644 python/cugraph/cugraph/dask/sampling/uniform_random_walks.py diff --git a/python/cugraph/cugraph/dask/sampling/biased_random_walks.py b/python/cugraph/cugraph/dask/sampling/biased_random_walks.py new file mode 100644 index 0000000000..a4dab3578a --- /dev/null +++ b/python/cugraph/cugraph/dask/sampling/biased_random_walks.py @@ -0,0 +1,171 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dask.distributed import wait, default_client +import dask_cudf +import cudf +import operator as op +from cugraph.dask.common.part_utils import ( + persist_dask_df_equal_parts_per_worker, +) + +from pylibcugraph import ResourceHandle + +from pylibcugraph import ( + biased_random_walks as pylibcugraph_biased_random_walks, +) + +from cugraph.dask.comms import comms as Comms + + +def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): + """ + Creates cudf Series from cupy arrays from pylibcugraph wrapper + """ + + if is_vertex_paths and len(cp_paths) > 0: + if number_map.implementation.numbered: + df_ = cudf.DataFrame() + df_["vertex_paths"] = cp_paths + df_ = number_map.unrenumber( + df_, "vertex_paths", preserve_order=True + ).compute() + vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) + + return vertex_paths + + return cudf.Series(cp_paths) + + +def _call_plc_biased_random_walks(sID, mg_graph_x, st_x, max_depth, random_state): + + return pylibcugraph_biased_random_walks( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + input_graph=mg_graph_x, + start_vertices=st_x, + max_length=max_depth, + random_state=random_state, + ) + + +def biased_random_walks( + input_graph, + start_vertices=None, + max_depth=None, + random_state=None +): + """ + compute random walks under the biased sampling framework for each nodes in + 'start_vertices' and returns a padded result along with the maximum path length. + Vertices with no outgoing edges will be padded with -1. 
+ + parameters + ---------- + input_graph : cuGraph.Graph + The graph can be either directed or undirected. + + start_vertices : int or list or cudf.Series or cudf.DataFrame + A single node or a list or a cudf.Series of nodes from which to run + the random walks. In case of multi-column vertices it should be + a cudf.DataFrame + + max_depth : int + The maximum depth of the random walks + + random_state: int, optional + Random seed to use when making sampling calls. + + + Returns + ------- + vertex_paths : dask_cudf.Series or dask_cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: dask_cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + max_path_length : int + The maximum path length + """ + client = default_client() + if isinstance(start_vertices, int): + start_vertices = [start_vertices] + + if isinstance(start_vertices, list): + start_vertices = cudf.Series(start_vertices) + + # start_vertices uses "external" vertex IDs, but if the graph has been + # renumbered, the start vertex IDs must also be renumbered. + if input_graph.renumbered: + # FIXME: This should match start_vertices type to the renumbered df type + # but verify that. 
If not retrieve the type and cast it when creating + # the dask_cudf from a cudf + start_vertices = input_graph.lookup_internal_vertex_id(start_vertices).compute() + start_vertices_type = input_graph.edgelist.edgelist_df.dtypes[0] + else: + # FIXME: Get the 'src' column names instead and retrieve the type + start_vertices_type = input_graph.input_df.dtypes.iloc[0] + start_vertices = dask_cudf.from_cudf( + start_vertices, npartitions=min(input_graph._npartitions, len(start_vertices)) + ) + start_vertices = start_vertices.astype(start_vertices_type) + start_vertices = persist_dask_df_equal_parts_per_worker( + start_vertices, client, return_type="dict" + ) + + result = [ + client.submit( + _call_plc_biased_random_walks, + Comms.get_session_id(), + input_graph._plc_graph[w], + start_v[0] if start_v else cudf.Series(dtype=start_vertices_type), + max_depth, + random_state=random_state, + workers=[w], + allow_other_workers=False, + ) + for w, start_v in start_vertices.items() + ] + + wait(result) + + result_vertex_paths = [client.submit(op.getitem, f, 0) for f in result] + result_edge_wgt_paths = [client.submit(op.getitem, f, 1) for f in result] + + cudf_vertex_paths = [ + client.submit(convert_to_cudf, cp_vertex_paths, input_graph.renumber_map, True) + for cp_vertex_paths in result_vertex_paths + ] + + cudf_edge_wgt_paths = [ + client.submit(convert_to_cudf, cp_edge_wgt_paths) + for cp_edge_wgt_paths in result_edge_wgt_paths + ] + + wait([cudf_vertex_paths, cudf_edge_wgt_paths]) + + ddf_vertex_paths = dask_cudf.from_delayed(cudf_vertex_paths).persist() + ddf_edge_wgt_paths = dask_cudf.from_delayed(cudf_edge_wgt_paths).persist() + wait([ddf_vertex_paths, ddf_edge_wgt_paths]) + + # Wait until the inactive futures are released + wait( + [ + (r.release(), c_v.release(), c_e.release()) + for r, c_v, c_e in zip(result, cudf_vertex_paths, cudf_edge_wgt_paths) + ] + ) + + return ddf_vertex_paths, ddf_edge_wgt_paths, max_depth diff --git 
a/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py b/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py new file mode 100644 index 0000000000..ba571a03e8 --- /dev/null +++ b/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py @@ -0,0 +1,171 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dask.distributed import wait, default_client +import dask_cudf +import cudf +import operator as op +from cugraph.dask.common.part_utils import ( + persist_dask_df_equal_parts_per_worker, +) + +from pylibcugraph import ResourceHandle + +from pylibcugraph import ( + uniform_random_walks as pylibcugraph_uniform_random_walks, +) + +from cugraph.dask.comms import comms as Comms + + +def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): + """ + Creates cudf Series from cupy arrays from pylibcugraph wrapper + """ + + if is_vertex_paths and len(cp_paths) > 0: + if number_map.implementation.numbered: + df_ = cudf.DataFrame() + df_["vertex_paths"] = cp_paths + df_ = number_map.unrenumber( + df_, "vertex_paths", preserve_order=True + ).compute() + vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) + + return vertex_paths + + return cudf.Series(cp_paths) + + +def _call_plc_uniform_random_walks(sID, mg_graph_x, st_x, max_depth, random_state): + + return pylibcugraph_uniform_random_walks( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + input_graph=mg_graph_x, + 
start_vertices=st_x, + max_length=max_depth, + random_state=random_state, + ) + + +def uniform_random_walks( + input_graph, + start_vertices=None, + max_depth=None, + random_state=None +): + """ + compute random walks under the uniform sampling framework for each nodes in + 'start_vertices' and returns a padded result along with the maximum path length. + Vertices with no outgoing edges will be padded with -1. + + parameters + ---------- + input_graph : cuGraph.Graph + The graph can be either directed or undirected. + + start_vertices : int or list or cudf.Series or cudf.DataFrame + A single node or a list or a cudf.Series of nodes from which to run + the random walks. In case of multi-column vertices it should be + a cudf.DataFrame + + max_depth : int + The maximum depth of the random walks + + random_state: int, optional + Random seed to use when making sampling calls. + + + Returns + ------- + vertex_paths : dask_cudf.Series or dask_cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: dask_cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + max_path_length : int + The maximum path length + """ + client = default_client() + if isinstance(start_vertices, int): + start_vertices = [start_vertices] + + if isinstance(start_vertices, list): + start_vertices = cudf.Series(start_vertices) + + # start_vertices uses "external" vertex IDs, but if the graph has been + # renumbered, the start vertex IDs must also be renumbered. + if input_graph.renumbered: + # FIXME: This should match start_vertices type to the renumbered df type + # but verify that. 
If not retrieve the type and cast it when creating + # the dask_cudf from a cudf + start_vertices = input_graph.lookup_internal_vertex_id(start_vertices).compute() + start_vertices_type = input_graph.edgelist.edgelist_df.dtypes[0] + else: + # FIXME: Get the 'src' column names instead and retrieve the type + start_vertices_type = input_graph.input_df.dtypes.iloc[0] + start_vertices = dask_cudf.from_cudf( + start_vertices, npartitions=min(input_graph._npartitions, len(start_vertices)) + ) + start_vertices = start_vertices.astype(start_vertices_type) + start_vertices = persist_dask_df_equal_parts_per_worker( + start_vertices, client, return_type="dict" + ) + + result = [ + client.submit( + _call_plc_uniform_random_walks, + Comms.get_session_id(), + input_graph._plc_graph[w], + start_v[0] if start_v else cudf.Series(dtype=start_vertices_type), + max_depth, + random_state=random_state, + workers=[w], + allow_other_workers=False, + ) + for w, start_v in start_vertices.items() + ] + + wait(result) + + result_vertex_paths = [client.submit(op.getitem, f, 0) for f in result] + result_edge_wgt_paths = [client.submit(op.getitem, f, 1) for f in result] + + cudf_vertex_paths = [ + client.submit(convert_to_cudf, cp_vertex_paths, input_graph.renumber_map, True) + for cp_vertex_paths in result_vertex_paths + ] + + cudf_edge_wgt_paths = [ + client.submit(convert_to_cudf, cp_edge_wgt_paths) + for cp_edge_wgt_paths in result_edge_wgt_paths + ] + + wait([cudf_vertex_paths, cudf_edge_wgt_paths]) + + ddf_vertex_paths = dask_cudf.from_delayed(cudf_vertex_paths).persist() + ddf_edge_wgt_paths = dask_cudf.from_delayed(cudf_edge_wgt_paths).persist() + wait([ddf_vertex_paths, ddf_edge_wgt_paths]) + + # Wait until the inactive futures are released + wait( + [ + (r.release(), c_v.release(), c_e.release()) + for r, c_v, c_e in zip(result, cudf_vertex_paths, cudf_edge_wgt_paths) + ] + ) + + return ddf_vertex_paths, ddf_edge_wgt_paths, max_depth From c8265e7cbb385d788ae862ccf11e9ab827a96b19 Mon 
Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 2 Jan 2025 08:05:50 -0800 Subject: [PATCH 28/60] update docstrings --- python/cugraph/cugraph/sampling/node2vec.py | 4 ++-- python/cugraph/cugraph/sampling/random_walks.py | 12 ++++++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index b5c93873fd..2a5c0f9389 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -113,8 +113,8 @@ def node2vec(G, start_vertices, max_depth=1, compress_result=True, p=1.0, q=1.0) """ warning_msg = ( - "random_walks is deprecated and will be removed " - "in the next release in favor of uniform_random_walks" + "node2vec is deprecated and will be removed " + "in the next release in favor of node2vec_random_walks" ) warnings.warn(warning_msg, FutureWarning) diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index f0177d92c4..eaf0581724 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -97,10 +97,18 @@ def random_walks( use_padding : bool, optional (default=False) If True, padded paths are returned else coalesced paths are returned. + Deprecated: only padded paths will be returned in the results + legacy_result_type : bool, optional (default=True) If True, will return a tuple of vertex_paths, edge_weight_paths and - sizes. If False, will return a tuple of vertex_paths, vertex_paths and - max_path_length + sizes where the 'max_depth' is proportional to the number of vertices. + If False, will return a tuple of vertex_paths, edge_weight_paths and + max_path_length where the 'max_depth' is proportional to the number of + edges. + + Deprecated: only padded paths will be returned where the 'max_depth' + is proportional to the number of edges instead of the number of + vertices when 'legacy_result_type' is 'True'. 
Returns ------- From 0a4d29baf994d6632f59db8a1cc3246b94cce464 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 2 Jan 2025 09:52:18 -0800 Subject: [PATCH 29/60] deprecate legacy implementation --- python/cugraph/cugraph/dask/sampling/random_walks.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/cugraph/cugraph/dask/sampling/random_walks.py b/python/cugraph/cugraph/dask/sampling/random_walks.py index 99996153d3..07dfe93b16 100644 --- a/python/cugraph/cugraph/dask/sampling/random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/random_walks.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import warnings from dask.distributed import wait, default_client import dask_cudf import cudf @@ -106,6 +107,13 @@ def random_walks( max_path_length : int The maximum path length """ + + warning_msg = ( + "random_walks is deprecated and will be removed " + "in the next release in favor of uniform_random_walks" + ) + warnings.warn(warning_msg, FutureWarning) + client = default_client() if isinstance(start_vertices, int): start_vertices = [start_vertices] From 4e0eff9e7ab68e3f3328779104aa2c176e974686 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 2 Jan 2025 13:13:30 -0800 Subject: [PATCH 30/60] remove unused import --- python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx | 1 - 1 file changed, 1 deletion(-) diff --git a/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx index d1c4760d1f..59e7bd96c4 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx @@ -17,7 +17,6 @@ from libc.stdint cimport uintptr_t from pylibcugraph._cugraph_c.resource_handle cimport ( - bool_t, cugraph_resource_handle_t, ) from pylibcugraph._cugraph_c.error cimport ( From 7c85269f8d3d0774d05741e8e593b5b4cd87fc2c Mon Sep 17 00:00:00 2001 From: jnke2016 
Date: Fri, 10 Jan 2025 11:27:09 -0800 Subject: [PATCH 31/60] update MG C tests --- cpp/tests/c_api/mg_random_walks_test.c | 27 +++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/cpp/tests/c_api/mg_random_walks_test.c b/cpp/tests/c_api/mg_random_walks_test.c index 13252e0f1d..525d340148 100644 --- a/cpp/tests/c_api/mg_random_walks_test.c +++ b/cpp/tests/c_api/mg_random_walks_test.c @@ -62,8 +62,15 @@ int generic_uniform_random_walks_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); + int rank = cugraph_resource_handle_get_rank(handle); + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, rank, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + ret_code = - cugraph_uniform_random_walks(handle, graph, d_start_view, max_depth, &result, &ret_error); + cugraph_uniform_random_walks( + handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_random_walks failed."); @@ -173,7 +180,14 @@ int generic_biased_random_walks_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); - ret_code = cugraph_biased_random_walks(handle, graph, d_start_view, FALSE, &result, &ret_error); + int rank = cugraph_resource_handle_get_rank(handle); + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, rank, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + + ret_code = 
cugraph_biased_random_walks( + handle, rng_state, graph, d_start_view, FALSE, &result, &ret_error); #if 1 TEST_ASSERT(test_ret_value, ret_code != CUGRAPH_SUCCESS, "biased_random_walks should have failed") @@ -277,8 +291,15 @@ int generic_node2vec_random_walks_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); + int rank = cugraph_resource_handle_get_rank(handle); + cugraph_rng_state_t* rng_state; + ret_code = cugraph_rng_state_create(handle, rank, &rng_state, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); + TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); + ret_code = - cugraph_node2vec_random_walks(handle, graph, d_start_view, FALSE, p, q, &result, &ret_error); + cugraph_node2vec_random_walks( + handle, rng_state, graph, d_start_view, FALSE, p, q, &result, &ret_error); #if 1 TEST_ASSERT( From 067d53bbd8891232c0835047f5ef2998274e7af4 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 10 Jan 2025 12:30:26 -0800 Subject: [PATCH 32/60] remove unused variable and update the number of arrays passed at the graph creation --- cpp/tests/c_api/mg_test_utils.cpp | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/cpp/tests/c_api/mg_test_utils.cpp b/cpp/tests/c_api/mg_test_utils.cpp index f96be61468..be6e869cf5 100644 --- a/cpp/tests/c_api/mg_test_utils.cpp +++ b/cpp/tests/c_api/mg_test_utils.cpp @@ -158,8 +158,6 @@ extern "C" int create_mg_test_graph(const cugraph_resource_handle_t* handle, rank = cugraph_resource_handle_get_rank(handle); - size_t original_num_edges = num_edges; - if (rank != 0) num_edges = 0; ret_code = @@ -191,6 +189,7 @@ extern "C" int create_mg_test_graph(const cugraph_resource_handle_t* handle, handle, wgt_view, (byte_t*)h_wgt, ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt copy_from_host failed."); + printf("\n in 
'mg_test_utils'\n"); ret_code = cugraph_graph_create_mg(handle, &properties, NULL, @@ -200,14 +199,15 @@ extern "C" int create_mg_test_graph(const cugraph_resource_handle_t* handle, NULL, NULL, store_transposed, - original_num_edges, // UNUSED - FALSE, + size_t{1}, // num_arrays FALSE, FALSE, FALSE, + TRUE, p_graph, ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); + printf("\n completed 'mg_test_utils'\n"); cugraph_type_erased_device_array_view_free(wgt_view); cugraph_type_erased_device_array_view_free(dst_view); @@ -256,8 +256,6 @@ extern "C" int create_mg_test_graph_double(const cugraph_resource_handle_t* hand rank = cugraph_resource_handle_get_rank(handle); - size_t original_num_edges = num_edges; - if (rank != 0) num_edges = 0; ret_code = @@ -298,7 +296,7 @@ extern "C" int create_mg_test_graph_double(const cugraph_resource_handle_t* hand NULL, NULL, store_transposed, - original_num_edges, // UNUSED + size_t{1}, // num_arrays FALSE, FALSE, FALSE, @@ -349,8 +347,6 @@ extern "C" int create_mg_test_graph_with_edge_ids(const cugraph_resource_handle_ rank = cugraph_resource_handle_get_rank(handle); - size_t original_num_edges = num_edges; - if (rank != 0) num_edges = 0; ret_code = @@ -390,7 +386,7 @@ extern "C" int create_mg_test_graph_with_edge_ids(const cugraph_resource_handle_ &idx_view, NULL, store_transposed, - original_num_edges, // UNUSED + size_t{1}, // num_arrays FALSE, FALSE, FALSE, @@ -449,8 +445,6 @@ extern "C" int create_mg_test_graph_with_properties(const cugraph_resource_handl rank = cugraph_resource_handle_get_rank(handle); - size_t original_num_edges = num_edges; - if (rank != 0) num_edges = 0; ret_code = @@ -517,7 +511,7 @@ extern "C" int create_mg_test_graph_with_properties(const cugraph_resource_handl &idx_view, &type_view, store_transposed, - original_num_edges, // UNUSED + size_t{1}, // num_arrays FALSE, FALSE, FALSE, @@ -581,8 +575,6 @@ int create_mg_test_graph_new(const cugraph_resource_handle_t* 
handle, rank = cugraph_resource_handle_get_rank(handle); - size_t original_num_edges = num_edges; - if (rank != 0) num_edges = 0; ret_code = @@ -650,7 +642,7 @@ int create_mg_test_graph_new(const cugraph_resource_handle_t* handle, &edge_id_view, &edge_type_view, store_transposed, - original_num_edges, // UNUSED + size_t{1}, // num_arrays FALSE, FALSE, FALSE, From b94a6eaadd131f6157342495bd9cda12505ef13f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 10 Jan 2025 16:35:07 -0800 Subject: [PATCH 33/60] update copyright and remove debug print --- cpp/tests/c_api/mg_test_utils.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/tests/c_api/mg_test_utils.cpp b/cpp/tests/c_api/mg_test_utils.cpp index be6e869cf5..4767ac9853 100644 --- a/cpp/tests/c_api/mg_test_utils.cpp +++ b/cpp/tests/c_api/mg_test_utils.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -189,7 +189,6 @@ extern "C" int create_mg_test_graph(const cugraph_resource_handle_t* handle, handle, wgt_view, (byte_t*)h_wgt, ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "wgt copy_from_host failed."); - printf("\n in 'mg_test_utils'\n"); ret_code = cugraph_graph_create_mg(handle, &properties, NULL, @@ -207,7 +206,6 @@ extern "C" int create_mg_test_graph(const cugraph_resource_handle_t* handle, p_graph, ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "graph creation failed."); - printf("\n completed 'mg_test_utils'\n"); cugraph_type_erased_device_array_view_free(wgt_view); cugraph_type_erased_device_array_view_free(dst_view); From e83722e0720654ae8b390b92aae2a7863e4f0a66 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 10 Jan 2025 16:36:39 -0800 Subject: [PATCH 34/60] fix renumbering bug --- cpp/src/c_api/random_walks.cpp | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/cpp/src/c_api/random_walks.cpp b/cpp/src/c_api/random_walks.cpp index c38db96baf..71fa7fdeb9 100644 --- a/cpp/src/c_api/random_walks.cpp +++ b/cpp/src/c_api/random_walks.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -340,8 +340,13 @@ struct biased_random_walks_functor : public cugraph::c_api::abstract_functor { // // Need to unrenumber the vertices in the resulting paths // - cugraph::unrenumber_local_int_vertices( - handle_, paths.data(), paths.size(), number_map->data(), 0, paths.size() - 1, false); + cugraph::unrenumber_int_vertices( + handle_, + paths.data(), + paths.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + false); result_ = new cugraph::c_api::cugraph_random_walk_result_t{ false, @@ -451,8 +456,13 @@ struct node2vec_random_walks_functor : public cugraph::c_api::abstract_functor { // // Need to unrenumber the vertices in the resulting paths // - cugraph::unrenumber_local_int_vertices( - handle_, paths.data(), paths.size(), number_map->data(), 0, paths.size(), false); + cugraph::unrenumber_int_vertices( + handle_, + paths.data(), + paths.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + false); result_ = new cugraph::c_api::cugraph_random_walk_result_t{ false, From 2c1a034ae1f2c7ed77fbbbb54fa27897a2342e30 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 10 Jan 2025 16:39:01 -0800 Subject: [PATCH 35/60] enable MG tests and fix bugs --- cpp/tests/c_api/mg_random_walks_test.c | 111 +++++++++++++++---------- 1 file changed, 66 insertions(+), 45 deletions(-) diff --git a/cpp/tests/c_api/mg_random_walks_test.c b/cpp/tests/c_api/mg_random_walks_test.c index 525d340148..d5a02ba242 100644 --- a/cpp/tests/c_api/mg_random_walks_test.c +++ b/cpp/tests/c_api/mg_random_walks_test.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -128,6 +128,7 @@ int generic_uniform_random_walks_test(const cugraph_resource_handle_t* handle, "uniform_random_walks found no edge when an edge exists"); } } else { + //printf("\na_ = %f, b_ = %f\n", M[h_result_verts[src_index]][h_result_verts[dst_index]], h_result_wgts[i * max_depth + j]); TEST_ASSERT(test_ret_value, M[h_result_verts[src_index]][h_result_verts[dst_index]] == h_result_wgts[i * max_depth + j], @@ -186,14 +187,12 @@ int generic_biased_random_walks_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - ret_code = cugraph_biased_random_walks( - handle, rng_state, graph, d_start_view, FALSE, &result, &ret_error); + ret_code = + cugraph_biased_random_walks( + handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); -#if 1 - TEST_ASSERT(test_ret_value, ret_code != CUGRAPH_SUCCESS, "biased_random_walks should have failed") -#else TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "biased_random_walks failed."); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_random_walks failed."); cugraph_type_erased_device_array_view_t* verts; cugraph_type_erased_device_array_view_t* wgts; @@ -205,10 +204,10 @@ int generic_biased_random_walks_test(const cugraph_resource_handle_t* handle, size_t wgts_size = cugraph_type_erased_device_array_view_size(wgts); vertex_t h_result_verts[verts_size]; - vertex_t h_result_wgts[wgts_size]; + weight_t h_result_wgts[wgts_size]; - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_verts, verts, &ret_error); + ret_code = cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_verts, verts, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); 
ret_code = cugraph_type_erased_device_array_view_copy_to_host( @@ -228,24 +227,36 @@ int generic_biased_random_walks_test(const cugraph_resource_handle_t* handle, M[h_src[i]][h_dst[i]] = h_wgt[i]; TEST_ASSERT(test_ret_value, - cugraph_random_walk_result_get_max_path_length() == max_depth, + cugraph_random_walk_result_get_max_path_length(result) == max_depth, "path length does not match"); for (int i = 0; (i < num_starts) && (test_ret_value == 0); ++i) { - TEST_ASSERT(test_ret_value, - M[h_start[i]][h_result_verts[i * (max_depth + 1)]] == h_result_wgts[i * max_depth], - "biased_random_walks got edge that doesn't exist"); - for (size_t j = 1; j < cugraph_random_walk_result_get_max_path_length(); ++j) - TEST_ASSERT( - test_ret_value, - M[h_start[i * (max_depth + 1) + j - 1]][h_result_verts[i * (max_depth + 1) + j]] == - h_result_wgts[i * max_depth + j - 1], - "biased_random_walks got edge that doesn't exist"); + TEST_ASSERT( + test_ret_value, h_start[i] == h_result_verts[i * (max_depth + 1)], "start of path not found"); + for (size_t j = 0; j < max_depth; ++j) { + int src_index = i * (max_depth + 1) + j; + int dst_index = src_index + 1; + if (h_result_verts[dst_index] < 0) { + if (h_result_verts[src_index] >= 0) { + int departing_count = 0; + for (int k = 0; k < num_vertices; ++k) { + // edges with weight/bias value less than 0 will not be sampled. 
+ if (M[h_result_verts[src_index]][k] > 0) departing_count++; + } + TEST_ASSERT(test_ret_value, + departing_count == 0, + "biased_random_walks found no edge when an edge exists"); + } + } else { + TEST_ASSERT(test_ret_value, + M[h_result_verts[src_index]][h_result_verts[dst_index]] == + h_result_wgts[i * max_depth + j], + "biased_random_walks got edge that doesn't exist"); + } + } } cugraph_random_walk_result_free(result); -#endif - cugraph_graph_free(graph); cugraph_error_free(ret_error); @@ -299,12 +310,9 @@ int generic_node2vec_random_walks_test(const cugraph_resource_handle_t* handle, ret_code = cugraph_node2vec_random_walks( - handle, rng_state, graph, d_start_view, FALSE, p, q, &result, &ret_error); + handle, rng_state, graph, d_start_view, max_depth, p, q, &result, &ret_error); + -#if 1 - TEST_ASSERT( - test_ret_value, ret_code != CUGRAPH_SUCCESS, "node2vec_random_walks should have failed") -#else TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "node2vec_random_walks failed."); @@ -318,10 +326,10 @@ int generic_node2vec_random_walks_test(const cugraph_resource_handle_t* handle, size_t wgts_size = cugraph_type_erased_device_array_view_size(wgts); vertex_t h_result_verts[verts_size]; - vertex_t h_result_wgts[wgts_size]; + weight_t h_result_wgts[wgts_size]; ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_verts, verts, &ret_error); + cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_result_verts, verts, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); ret_code = cugraph_type_erased_device_array_view_copy_to_host( @@ -341,24 +349,35 @@ int generic_node2vec_random_walks_test(const cugraph_resource_handle_t* handle, M[h_src[i]][h_dst[i]] = h_wgt[i]; TEST_ASSERT(test_ret_value, - cugraph_random_walk_result_get_max_path_length() == max_depth, + 
cugraph_random_walk_result_get_max_path_length(result) == max_depth, "path length does not match"); for (int i = 0; (i < num_starts) && (test_ret_value == 0); ++i) { - TEST_ASSERT(test_ret_value, - M[h_start[i]][h_result_verts[i * (max_depth + 1)]] == h_result_wgts[i * max_depth], - "node2vec_random_walks got edge that doesn't exist"); - for (size_t j = 1; j < cugraph_random_walk_result_get_max_path_length(); ++j) - TEST_ASSERT( - test_ret_value, - M[h_start[i * (max_depth + 1) + j - 1]][h_result_verts[i * (max_depth + 1) + j]] == - h_result_wgts[i * max_depth + j - 1], - "node2vec_random_walks got edge that doesn't exist"); + TEST_ASSERT( + test_ret_value, h_start[i] == h_result_verts[i * (max_depth + 1)], "start of path not found"); + for (size_t j = 0; j < max_depth; ++j) { + int src_index = i * (max_depth + 1) + j; + int dst_index = src_index + 1; + if (h_result_verts[dst_index] < 0) { + if (h_result_verts[src_index] >= 0) { + int departing_count = 0; + for (int k = 0; k < num_vertices; ++k) { + if (M[h_result_verts[src_index]][k] >= 0) departing_count++; + } + TEST_ASSERT(test_ret_value, + departing_count == 0, + "node2vec_random_walks found no edge when an edge exists"); + } + } else { + TEST_ASSERT(test_ret_value, + M[h_result_verts[src_index]][h_result_verts[dst_index]] == + h_result_wgts[i * max_depth + j], + "node2vec_random_walks got edge that doesn't exist"); + } + } } cugraph_random_walk_result_free(result); -#endif - cugraph_graph_free(graph); cugraph_error_free(ret_error); @@ -386,14 +405,15 @@ int test_biased_random_walks(const cugraph_resource_handle_t* handle) size_t num_edges = 8; size_t num_vertices = 6; size_t num_starts = 2; + size_t max_depth = 3; vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - weight_t wgt[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + weight_t wgt[] = {0, 1, 2, 3, 4, 5, 6, 7}; vertex_t start[] = {2, 2}; return generic_biased_random_walks_test( - handle, src, dst, wgt, num_vertices, 
num_edges, start, num_starts, FALSE, FALSE); + handle, src, dst, wgt, num_vertices, num_edges, start, num_starts, max_depth, FALSE); } int test_node2vec_random_walks(const cugraph_resource_handle_t* handle) @@ -401,17 +421,18 @@ int test_node2vec_random_walks(const cugraph_resource_handle_t* handle) size_t num_edges = 8; size_t num_vertices = 6; size_t num_starts = 2; + size_t max_depth = 3; vertex_t src[] = {0, 1, 1, 2, 2, 2, 3, 4}; vertex_t dst[] = {1, 3, 4, 0, 1, 3, 5, 5}; - weight_t wgt[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + weight_t wgt[] = {0, 1, 2, 3, 4, 5, 6, 7}; vertex_t start[] = {2, 2}; weight_t p = 5; weight_t q = 8; return generic_node2vec_random_walks_test( - handle, src, dst, wgt, num_vertices, num_edges, start, num_starts, p, q, FALSE, FALSE); + handle, src, dst, wgt, num_vertices, num_edges, start, num_starts, p, q, max_depth, FALSE); } int main(int argc, char** argv) From d521dfc8a40f229dbad50ac09b4efe470459149e Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 10 Jan 2025 16:41:56 -0800 Subject: [PATCH 36/60] fix style --- cpp/include/cugraph_c/sampling_algorithms.h | 2 +- cpp/tests/c_api/mg_random_walks_test.c | 23 ++++++++----------- cpp/tests/c_api/mg_test_utils.cpp | 10 ++++---- cpp/tests/c_api/sg_random_walks_test.c | 12 +++++----- python/cugraph/cugraph/__init__.py | 2 +- python/cugraph/cugraph/dask/__init__.py | 2 +- .../dask/sampling/biased_random_walks.py | 9 +++----- .../dask/sampling/node2vec_random_walks.py | 22 +++++++----------- .../cugraph/dask/sampling/random_walks.py | 8 +++---- .../dask/sampling/uniform_random_walks.py | 9 +++----- python/cugraph/cugraph/sampling/__init__.py | 2 +- .../cugraph/sampling/biased_random_walks.py | 6 ++--- python/cugraph/cugraph/sampling/node2vec.py | 8 +++---- .../cugraph/sampling/node2vec_random_walks.py | 11 +++++---- .../cugraph/cugraph/sampling/random_walks.py | 22 ++++++++---------- .../cugraph/sampling/uniform_random_walks.py | 6 ++--- .../pylibcugraph/pylibcugraph/CMakeLists.txt | 2 
+- python/pylibcugraph/pylibcugraph/__init__.py | 2 +- .../pylibcugraph/_cugraph_c/algorithms.pxd | 4 ++-- .../pylibcugraph/biased_random_walks.pyx | 8 +++---- .../pylibcugraph/node2vec_random_walks.pyx | 6 ++--- .../pylibcugraph/uniform_random_walks.pyx | 8 +++---- 22 files changed, 84 insertions(+), 100 deletions(-) diff --git a/cpp/include/cugraph_c/sampling_algorithms.h b/cpp/include/cugraph_c/sampling_algorithms.h index 8c0040fff1..05639224aa 100644 --- a/cpp/include/cugraph_c/sampling_algorithms.h +++ b/cpp/include/cugraph_c/sampling_algorithms.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/tests/c_api/mg_random_walks_test.c b/cpp/tests/c_api/mg_random_walks_test.c index d5a02ba242..c62725029a 100644 --- a/cpp/tests/c_api/mg_random_walks_test.c +++ b/cpp/tests/c_api/mg_random_walks_test.c @@ -68,9 +68,8 @@ int generic_uniform_random_walks_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - ret_code = - cugraph_uniform_random_walks( - handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); + ret_code = cugraph_uniform_random_walks( + handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_random_walks failed."); @@ -128,7 +127,8 @@ int generic_uniform_random_walks_test(const cugraph_resource_handle_t* handle, "uniform_random_walks found no edge when an edge exists"); } } else { - //printf("\na_ = %f, b_ = %f\n", M[h_result_verts[src_index]][h_result_verts[dst_index]], h_result_wgts[i * 
max_depth + j]); + // printf("\na_ = %f, b_ = %f\n", M[h_result_verts[src_index]][h_result_verts[dst_index]], + // h_result_wgts[i * max_depth + j]); TEST_ASSERT(test_ret_value, M[h_result_verts[src_index]][h_result_verts[dst_index]] == h_result_wgts[i * max_depth + j], @@ -187,9 +187,8 @@ int generic_biased_random_walks_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - ret_code = - cugraph_biased_random_walks( - handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); + ret_code = cugraph_biased_random_walks( + handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_random_walks failed."); @@ -308,10 +307,8 @@ int generic_node2vec_random_walks_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); TEST_ALWAYS_ASSERT(ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - ret_code = - cugraph_node2vec_random_walks( - handle, rng_state, graph, d_start_view, max_depth, p, q, &result, &ret_error); - + ret_code = cugraph_node2vec_random_walks( + handle, rng_state, graph, d_start_view, max_depth, p, q, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "node2vec_random_walks failed."); @@ -328,8 +325,8 @@ int generic_node2vec_random_walks_test(const cugraph_resource_handle_t* handle, vertex_t h_result_verts[verts_size]; weight_t h_result_wgts[wgts_size]; - ret_code = - cugraph_type_erased_device_array_view_copy_to_host(handle, (byte_t*)h_result_verts, verts, &ret_error); + ret_code = 
cugraph_type_erased_device_array_view_copy_to_host( + handle, (byte_t*)h_result_verts, verts, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); ret_code = cugraph_type_erased_device_array_view_copy_to_host( diff --git a/cpp/tests/c_api/mg_test_utils.cpp b/cpp/tests/c_api/mg_test_utils.cpp index 4767ac9853..3bb3970164 100644 --- a/cpp/tests/c_api/mg_test_utils.cpp +++ b/cpp/tests/c_api/mg_test_utils.cpp @@ -198,7 +198,7 @@ extern "C" int create_mg_test_graph(const cugraph_resource_handle_t* handle, NULL, NULL, store_transposed, - size_t{1}, // num_arrays + size_t{1}, // num_arrays FALSE, FALSE, FALSE, @@ -294,7 +294,7 @@ extern "C" int create_mg_test_graph_double(const cugraph_resource_handle_t* hand NULL, NULL, store_transposed, - size_t{1}, // num_arrays + size_t{1}, // num_arrays FALSE, FALSE, FALSE, @@ -384,7 +384,7 @@ extern "C" int create_mg_test_graph_with_edge_ids(const cugraph_resource_handle_ &idx_view, NULL, store_transposed, - size_t{1}, // num_arrays + size_t{1}, // num_arrays FALSE, FALSE, FALSE, @@ -509,7 +509,7 @@ extern "C" int create_mg_test_graph_with_properties(const cugraph_resource_handl &idx_view, &type_view, store_transposed, - size_t{1}, // num_arrays + size_t{1}, // num_arrays FALSE, FALSE, FALSE, @@ -640,7 +640,7 @@ int create_mg_test_graph_new(const cugraph_resource_handle_t* handle, &edge_id_view, &edge_type_view, store_transposed, - size_t{1}, // num_arrays + size_t{1}, // num_arrays FALSE, FALSE, FALSE, diff --git a/cpp/tests/c_api/sg_random_walks_test.c b/cpp/tests/c_api/sg_random_walks_test.c index 8d7328cba9..a6aeeff58d 100644 --- a/cpp/tests/c_api/sg_random_walks_test.c +++ b/cpp/tests/c_api/sg_random_walks_test.c @@ -70,8 +70,8 @@ int generic_uniform_random_walks_test(vertex_t* h_src, ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); - ret_code = - 
cugraph_uniform_random_walks(handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); + ret_code = cugraph_uniform_random_walks( + handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "uniform_random_walks failed."); @@ -192,13 +192,13 @@ int generic_biased_random_walks_test(vertex_t* h_src, handle, d_start_view, (byte_t*)h_start, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "start copy_from_host failed."); - + cugraph_rng_state_t* rng_state; ret_code = cugraph_rng_state_create(handle, 0, &rng_state, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "rng_state create failed."); - - ret_code = - cugraph_biased_random_walks(handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); + + ret_code = cugraph_biased_random_walks( + handle, rng_state, graph, d_start_view, max_depth, &result, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "biased_random_walks failed."); diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index 4f2c47cf41..8aeba6a5d3 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index 3b63e0ebff..617eb25a2b 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/cugraph/dask/sampling/biased_random_walks.py b/python/cugraph/cugraph/dask/sampling/biased_random_walks.py index a4dab3578a..965d119ed7 100644 --- a/python/cugraph/cugraph/dask/sampling/biased_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/biased_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -60,10 +60,7 @@ def _call_plc_biased_random_walks(sID, mg_graph_x, st_x, max_depth, random_state def biased_random_walks( - input_graph, - start_vertices=None, - max_depth=None, - random_state=None + input_graph, start_vertices=None, max_depth=None, random_state=None ): """ compute random walks under the biased sampling framework for each nodes in @@ -82,7 +79,7 @@ def biased_random_walks( max_depth : int The maximum depth of the random walks - + random_state: int, optional Random seed to use when making sampling calls. diff --git a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py index 2ba3a2d238..3b800749b2 100644 --- a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. 
+# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -48,7 +48,9 @@ def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): return cudf.Series(cp_paths) -def _call_plc_node2vec_random_walks(sID, mg_graph_x, st_x, max_depth, p, q, random_state): +def _call_plc_node2vec_random_walks( + sID, mg_graph_x, st_x, max_depth, p, q, random_state +): return pylibcugraph_node2vec_random_walks( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), @@ -57,18 +59,13 @@ def _call_plc_node2vec_random_walks(sID, mg_graph_x, st_x, max_depth, p, q, rand max_depth=max_depth, p=p, q=q, - random_state=random_state + random_state=random_state, ) # FIXME: Add type anotation def node2vec_random_walks( - input_graph, - start_vertices=None, - max_depth=None, - p=1.0, - q=1.0, - random_state=None + input_graph, start_vertices=None, max_depth=None, p=1.0, q=1.0, random_state=None ): """ compute random walks under the node2vec sampling framework for each nodes in @@ -101,7 +98,7 @@ def node2vec_random_walks( is likelier to visit nodes closer to the outgoing node. If q < 1, the random walk is likelier to visit nodes further from the outgoing node. A positive float. - + random_state: int, optional Random seed to use when making sampling calls. 
@@ -113,7 +110,7 @@ def node2vec_random_walks( edge_weight_paths: dask_cudf.Series Series containing the edge weights of edges represented by the returned vertex_paths - + and max_path_length : int @@ -130,8 +127,6 @@ def node2vec_random_walks( if (not isinstance(q, float)) or (q <= 0.0): raise ValueError(f"'q' must be a positive float, got: {q}") - - if isinstance(start_vertices, int): start_vertices = [start_vertices] @@ -201,4 +196,3 @@ def node2vec_random_walks( ) return ddf_vertex_paths, ddf_edge_wgt_paths, max_depth - diff --git a/python/cugraph/cugraph/dask/sampling/random_walks.py b/python/cugraph/cugraph/dask/sampling/random_walks.py index 07dfe93b16..80c241a7fc 100644 --- a/python/cugraph/cugraph/dask/sampling/random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -109,9 +109,9 @@ def random_walks( """ warning_msg = ( - "random_walks is deprecated and will be removed " - "in the next release in favor of uniform_random_walks" - ) + "random_walks is deprecated and will be removed " + "in the next release in favor of uniform_random_walks" + ) warnings.warn(warning_msg, FutureWarning) client = default_client() diff --git a/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py b/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py index ba571a03e8..ee5152bc8a 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -60,10 +60,7 @@ def _call_plc_uniform_random_walks(sID, mg_graph_x, st_x, max_depth, random_stat def uniform_random_walks( - input_graph, - start_vertices=None, - max_depth=None, - random_state=None + input_graph, start_vertices=None, max_depth=None, random_state=None ): """ compute random walks under the uniform sampling framework for each nodes in @@ -82,7 +79,7 @@ def uniform_random_walks( max_depth : int The maximum depth of the random walks - + random_state: int, optional Random seed to use when making sampling calls. diff --git a/python/cugraph/cugraph/sampling/__init__.py b/python/cugraph/cugraph/sampling/__init__.py index 495483d135..88439d779e 100644 --- a/python/cugraph/cugraph/sampling/__init__.py +++ b/python/cugraph/cugraph/sampling/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/cugraph/sampling/biased_random_walks.py b/python/cugraph/cugraph/sampling/biased_random_walks.py index 65b6524197..9c70f21f63 100644 --- a/python/cugraph/cugraph/sampling/biased_random_walks.py +++ b/python/cugraph/cugraph/sampling/biased_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -50,7 +50,7 @@ def biased_random_walks( The max depth is relative to the number of edges hence the vertex_paths size is max_depth + 1. For instance, a 'max_depth' of 2 with only one seed will result in a vertex_path of size 3. - + random_state: int, optional Random seed to use when making sampling calls. 
@@ -104,7 +104,7 @@ def biased_random_walks( input_graph=G._plc_graph, start_vertices=start_vertices, max_length=max_depth, - random_state=random_state + random_state=random_state, ) vertex_paths = cudf.Series(vertex_paths) diff --git a/python/cugraph/cugraph/sampling/node2vec.py b/python/cugraph/cugraph/sampling/node2vec.py index 2a5c0f9389..f7a1d3aa64 100644 --- a/python/cugraph/cugraph/sampling/node2vec.py +++ b/python/cugraph/cugraph/sampling/node2vec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -113,9 +113,9 @@ def node2vec(G, start_vertices, max_depth=1, compress_result=True, p=1.0, q=1.0) """ warning_msg = ( - "node2vec is deprecated and will be removed " - "in the next release in favor of node2vec_random_walks" - ) + "node2vec is deprecated and will be removed " + "in the next release in favor of node2vec_random_walks" + ) warnings.warn(warning_msg, FutureWarning) if (not isinstance(max_depth, int)) or (max_depth < 1): diff --git a/python/cugraph/cugraph/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/sampling/node2vec_random_walks.py index 3f8a6bdb66..1f93f3c61d 100644 --- a/python/cugraph/cugraph/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/sampling/node2vec_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -41,7 +41,9 @@ def ensure_valid_dtype(input_graph, start_vertices): return start_vertices -def node2vec_random_walks(G, start_vertices, max_depth=1, p=1.0, q=1.0, random_state=None): +def node2vec_random_walks( + G, start_vertices, max_depth=1, p=1.0, q=1.0, random_state=None +): """ Computes random walks for each node in 'start_vertices', under the node2vec sampling framework. @@ -80,7 +82,7 @@ def node2vec_random_walks(G, start_vertices, max_depth=1, p=1.0, q=1.0, random_s is likelier to visit nodes closer to the outgoing node. If q < 1, the random walk is likelier to visit nodes further from the outgoing node. A positive float. - + random_state: int, optional Random seed to use when making sampling calls. @@ -117,7 +119,6 @@ def node2vec_random_walks(G, start_vertices, max_depth=1, p=1.0, q=1.0, random_s if (not isinstance(q, float)) or (q <= 0.0): raise ValueError(f"'q' must be a positive float, got: {q}") - if isinstance(start_vertices, int): start_vertices = [start_vertices] @@ -147,7 +148,7 @@ def node2vec_random_walks(G, start_vertices, max_depth=1, p=1.0, q=1.0, random_s max_depth=max_depth, p=p, q=q, - random_state=random_state + random_state=random_state, ) vertex_set = cudf.Series(vertex_set) edge_set = cudf.Series(edge_set) diff --git a/python/cugraph/cugraph/sampling/random_walks.py b/python/cugraph/cugraph/sampling/random_walks.py index eaf0581724..09e2e4d2ab 100644 --- a/python/cugraph/cugraph/sampling/random_walks.py +++ b/python/cugraph/cugraph/sampling/random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -97,7 +97,7 @@ def random_walks( use_padding : bool, optional (default=False) If True, padded paths are returned else coalesced paths are returned. 
- Deprecated: only padded paths will be returned in the results + Deprecated: only padded paths will be returned in the results legacy_result_type : bool, optional (default=True) If True, will return a tuple of vertex_paths, edge_weight_paths and @@ -137,9 +137,9 @@ def random_walks( """ warning_msg = ( - "random_walks is deprecated and will be removed " - "in the next release in favor of uniform_random_walks" - ) + "random_walks is deprecated and will be removed " + "in the next release in favor of uniform_random_walks" + ) warnings.warn(warning_msg, FutureWarning) # FIXME: Coalesced path results have been deprecated and should no longer be @@ -151,7 +151,7 @@ def random_walks( # returning results paths proprtional to the number of edges. Furthermore, # Coalesced path results should also be removed in favor of always returning # padded results. The flags 'legacy_result_type' and 'use_padding" should be - # removed. + # removed. if legacy_result_type or use_padding is False: warning_msg = ( @@ -160,7 +160,7 @@ def random_walks( "only padded paths will be returned instead" ) warnings.warn(warning_msg, PendingDeprecationWarning) - + if random_walks_type != "uniform": warning_msg = ( "random_walks_type is deprecated and will be removed " @@ -169,7 +169,6 @@ def random_walks( ) warnings.warn(warning_msg, FutureWarning) - if max_depth is None: raise TypeError("must specify a 'max_depth'") @@ -180,7 +179,7 @@ def random_walks( # Consider a different return type if Nx types are passed in. # The new API for random walk should instead always return the triple # (vertex_paths, edge_wgt_paths, max_path_length) - + G, _ = ensure_cugraph_obj_for_nx(G) if isinstance(start_vertices, int): @@ -300,9 +299,8 @@ def rw_path( """ warning_msg = ( - "This method is deprecated in favor of always returning " - "padded results." - ) + "This method is deprecated in favor of always returning " "padded results." 
+ ) warnings.warn(warning_msg, PendingDeprecationWarning) diff --git a/python/cugraph/cugraph/sampling/uniform_random_walks.py b/python/cugraph/cugraph/sampling/uniform_random_walks.py index 160bdc9c9f..ba4af23227 100644 --- a/python/cugraph/cugraph/sampling/uniform_random_walks.py +++ b/python/cugraph/cugraph/sampling/uniform_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -50,7 +50,7 @@ def uniform_random_walks( The max depth is relative to the number of edges hence the vertex_paths size is max_depth + 1. For instance, a 'max_depth' of 2 with only one seed will result in a vertex_path of size 3. - + random_state: int, optional Random seed to use when making sampling calls. @@ -104,7 +104,7 @@ def uniform_random_walks( input_graph=G._plc_graph, start_vertices=start_vertices, max_length=max_depth, - random_state=random_state + random_state=random_state, ) vertex_paths = cudf.Series(vertex_paths) diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index d453c62001..3da38aa630 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 92c6459686..a532adf721 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd index e043b7672b..604ef77d6a 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/algorithms.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -395,4 +395,4 @@ cdef extern from "cugraph_c/algorithms.h": double q, cugraph_random_walk_result_t** result, cugraph_error_t** error - ) \ No newline at end of file + ) diff --git a/python/pylibcugraph/pylibcugraph/biased_random_walks.pyx b/python/pylibcugraph/pylibcugraph/biased_random_walks.pyx index 2f37de7e93..c6847af424 100644 --- a/python/pylibcugraph/pylibcugraph/biased_random_walks.pyx +++ b/python/pylibcugraph/pylibcugraph/biased_random_walks.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -82,7 +82,7 @@ def biased_random_walks(ResourceHandle resource_handle, max_length: size_t The maximum depth of the biased random walks - + random_state: int (Optional) Random state to use when generating samples. Optional argument, defaults to a hash of process id, time, and hostname. @@ -114,9 +114,9 @@ def biased_random_walks(ResourceHandle resource_handle, cai_start_ptr, len(start_vertices), get_c_type_from_numpy_type(start_vertices.dtype)) - + cg_rng_state = CuGraphRandomState(resource_handle, random_state) - + cdef cugraph_rng_state_t* rng_state_ptr = \ cg_rng_state.rng_state_ptr diff --git a/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx index 59e7bd96c4..a8ce23a01a 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec_random_walks.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -94,7 +94,7 @@ def node2vec_random_walks(ResourceHandle resource_handle, further from the outgoing node. If q > 1, the random walk is likelier to visit nodes closer to the outgoing node. If q < 1, the random walk is likelier to visit nodes further from the outgoing node. - + random_state: int (Optional) Random state to use when generating samples. Optional argument, defaults to a hash of process id, time, and hostname. 
@@ -152,7 +152,7 @@ def node2vec_random_walks(ResourceHandle resource_handle, get_c_type_from_numpy_type(seed_array.dtype)) cg_rng_state = CuGraphRandomState(resource_handle, random_state) - + cdef cugraph_rng_state_t* rng_state_ptr = \ cg_rng_state.rng_state_ptr diff --git a/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx b/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx index 95379254e4..f3889264c0 100644 --- a/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx +++ b/python/pylibcugraph/pylibcugraph/uniform_random_walks.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -82,7 +82,7 @@ def uniform_random_walks(ResourceHandle resource_handle, max_length: size_t The maximum depth of the uniform random walks - + random_state: int (Optional) Random state to use when generating samples. Optional argument, defaults to a hash of process id, time, and hostname. 
@@ -114,9 +114,9 @@ def uniform_random_walks(ResourceHandle resource_handle, cai_start_ptr, len(start_vertices), get_c_type_from_numpy_type(start_vertices.dtype)) - + cg_rng_state = CuGraphRandomState(resource_handle, random_state) - + cdef cugraph_rng_state_t* rng_state_ptr = \ cg_rng_state.rng_state_ptr From 6418b9695272b3029ea8d05da03229f8d48007af Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 10 Jan 2025 16:43:09 -0800 Subject: [PATCH 37/60] remove unused import --- python/cugraph/cugraph/sampling/biased_random_walks.py | 1 - python/cugraph/cugraph/sampling/uniform_random_walks.py | 1 - 2 files changed, 2 deletions(-) diff --git a/python/cugraph/cugraph/sampling/biased_random_walks.py b/python/cugraph/cugraph/sampling/biased_random_walks.py index 9c70f21f63..41340cba8a 100644 --- a/python/cugraph/cugraph/sampling/biased_random_walks.py +++ b/python/cugraph/cugraph/sampling/biased_random_walks.py @@ -12,7 +12,6 @@ # limitations under the License.
import cudf -import cupy as cp from pylibcugraph import ResourceHandle from pylibcugraph import ( uniform_random_walks as pylibcugraph_uniform_random_walks, From af3b31f62c4bd31a170fcc17294f6ae808201059 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Sun, 12 Jan 2025 17:40:25 -0800 Subject: [PATCH 38/60] add type annotations --- .../dask/sampling/biased_random_walks.py | 30 ++++++++++++--- .../dask/sampling/node2vec_random_walks.py | 37 ++++++++++++++----- .../dask/sampling/uniform_random_walks.py | 24 ++++++++++-- 3 files changed, 71 insertions(+), 20 deletions(-) diff --git a/python/cugraph/cugraph/dask/sampling/biased_random_walks.py b/python/cugraph/cugraph/dask/sampling/biased_random_walks.py index 965d119ed7..9a70ab658c 100644 --- a/python/cugraph/cugraph/dask/sampling/biased_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/biased_random_walks.py @@ -15,6 +15,7 @@ from dask.distributed import wait, default_client import dask_cudf import cudf +import cupy as cp import operator as op from cugraph.dask.common.part_utils import ( persist_dask_df_equal_parts_per_worker, @@ -27,9 +28,14 @@ ) from cugraph.dask.comms import comms as Comms +from typing import Tuple, Union -def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): +def convert_to_cudf( + cp_paths: cp.ndarray, + number_map=None, + is_vertex_paths: bool = False + ) -> cudf.Series: """ Creates cudf Series from cupy arrays from pylibcugraph wrapper """ @@ -48,7 +54,13 @@ def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): return cudf.Series(cp_paths) -def _call_plc_biased_random_walks(sID, mg_graph_x, st_x, max_depth, random_state): +def _call_plc_biased_random_walks( + sID: bytes, + mg_graph_x, + st_x: cudf.Series, + max_depth: int, + random_state: int + ) -> Tuple[cp.ndarray, cp.ndarray]: return pylibcugraph_biased_random_walks( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), @@ -60,8 +72,12 @@ def _call_plc_biased_random_walks(sID, mg_graph_x, 
st_x, max_depth, random_state def biased_random_walks( - input_graph, start_vertices=None, max_depth=None, random_state=None -): + input_graph, + start_vertices: Union[int, list, cudf.Series, cudf.DataFrame, cudf.Series + ] = None, + max_depth: int = 1, + random_state: int = None +) -> Tuple[Union[dask_cudf.Series, dask_cudf.DataFrame], dask_cudf.Series, int]: """ compute random walks under the biased sampling framework for each nodes in 'start_vertices' and returns a padded result along with the maximum path length. @@ -77,8 +93,10 @@ def biased_random_walks( the random walks. In case of multi-column vertices it should be a cudf.DataFrame - max_depth : int - The maximum depth of the random walks + max_depth: int + The maximum depth of the random walks. If not specified, the maximum + depth is set to 1. + Must be a positive integer random_state: int, optional Random seed to use when making sampling calls. diff --git a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py index 3b800749b2..d3e84c4c42 100644 --- a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py @@ -15,21 +15,26 @@ from dask.distributed import wait, default_client import dask_cudf import cudf +import cupy as cp import operator as op from cugraph.dask.common.part_utils import ( persist_dask_df_equal_parts_per_worker, ) -from pylibcugraph import ResourceHandle - from pylibcugraph import ( + ResourceHandle, node2vec_random_walks as pylibcugraph_node2vec_random_walks, ) from cugraph.dask.comms import comms as Comms +from typing import Tuple, Union -def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): +def convert_to_cudf( + cp_paths: cp.ndarray, + number_map=None, + is_vertex_paths: bool = False + ) -> cudf.Series: """ Creates cudf Series from cupy arrays from pylibcugraph wrapper """ @@ -49,8 +54,14 @@ def convert_to_cudf(cp_paths, 
number_map=None, is_vertex_paths=False): def _call_plc_node2vec_random_walks( - sID, mg_graph_x, st_x, max_depth, p, q, random_state -): + sID: bytes, + mg_graph_x, + st_x: cudf.Series, + max_depth: int, + p: float, + q: float, + random_state: int +) -> Tuple[cp.ndarray, cp.ndarray]: return pylibcugraph_node2vec_random_walks( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), @@ -63,10 +74,15 @@ def _call_plc_node2vec_random_walks( ) -# FIXME: Add type anotation def node2vec_random_walks( - input_graph, start_vertices=None, max_depth=None, p=1.0, q=1.0, random_state=None -): + input_graph, + start_vertices: Union[int, list, cudf.Series, cudf.DataFrame, cudf.Series + ] = None, + max_depth: int = 1, + p: float = 1.0, + q: float = 1.0, + random_state: int = None +) -> Tuple[Union[dask_cudf.Series, dask_cudf.DataFrame], dask_cudf.Series, int]: """ compute random walks under the node2vec sampling framework for each nodes in 'start_vertices' and returns a padded result along with the maximum path length. @@ -82,9 +98,10 @@ def node2vec_random_walks( the random walks. In case of multi-column vertices it should be a cudf.DataFrame. Only supports int32 currently. - max_depth: int, optional (default=1) + max_depth: int The maximum depth of the random walks. If not specified, the maximum depth is set to 1. 
+ Must be a positive integer p: float, optional (default=1.0, [0 < p]) Return factor, which represents the likelihood of backtracking to @@ -158,7 +175,7 @@ def node2vec_random_walks( Comms.get_session_id(), input_graph._plc_graph[w], start_v[0] if start_v else cudf.Series(dtype=start_vertices_type), - max_depth, + max_depth if isinstance(max_depth, int) else 1, p=p, q=q, random_state=random_state, diff --git a/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py b/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py index ee5152bc8a..429ec00ae0 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py @@ -15,6 +15,7 @@ from dask.distributed import wait, default_client import dask_cudf import cudf +import cupy as cp import operator as op from cugraph.dask.common.part_utils import ( persist_dask_df_equal_parts_per_worker, @@ -27,9 +28,14 @@ ) from cugraph.dask.comms import comms as Comms +from typing import Tuple, Union -def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): +def convert_to_cudf( + cp_paths: cp.ndarray, + number_map=None, + is_vertex_paths: bool = False + ) -> cudf.Series: """ Creates cudf Series from cupy arrays from pylibcugraph wrapper """ @@ -48,7 +54,13 @@ def convert_to_cudf(cp_paths, number_map=None, is_vertex_paths=False): return cudf.Series(cp_paths) -def _call_plc_uniform_random_walks(sID, mg_graph_x, st_x, max_depth, random_state): +def _call_plc_uniform_random_walks( + sID: bytes, + mg_graph_x, + st_x: cudf.Series, + max_depth: int, + random_state: int + ) -> Tuple[cp.ndarray, cp.ndarray]: return pylibcugraph_uniform_random_walks( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), @@ -60,8 +72,12 @@ def _call_plc_uniform_random_walks(sID, mg_graph_x, st_x, max_depth, random_stat def uniform_random_walks( - input_graph, start_vertices=None, max_depth=None, random_state=None -): + input_graph, + start_vertices: 
Union[int, list, cudf.Series, cudf.DataFrame, cudf.Series + ] = None, + max_depth: int = 1, + random_state: int = None +) -> Tuple[Union[dask_cudf.Series, dask_cudf.DataFrame], dask_cudf.Series, int]: """ compute random walks under the uniform sampling framework for each nodes in 'start_vertices' and returns a padded result along with the maximum path length. From f0e3b0faf05e03743853ca839b0178545a591edf Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Sun, 12 Jan 2025 17:44:43 -0800 Subject: [PATCH 39/60] fix style --- .../dask/sampling/biased_random_walks.py | 19 ++++++------------- .../dask/sampling/node2vec_random_walks.py | 15 ++++++--------- .../dask/sampling/uniform_random_walks.py | 19 ++++++------------- 3 files changed, 18 insertions(+), 35 deletions(-) diff --git a/python/cugraph/cugraph/dask/sampling/biased_random_walks.py b/python/cugraph/cugraph/dask/sampling/biased_random_walks.py index 9a70ab658c..277dce6894 100644 --- a/python/cugraph/cugraph/dask/sampling/biased_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/biased_random_walks.py @@ -32,10 +32,8 @@ def convert_to_cudf( - cp_paths: cp.ndarray, - number_map=None, - is_vertex_paths: bool = False - ) -> cudf.Series: + cp_paths: cp.ndarray, number_map=None, is_vertex_paths: bool = False +) -> cudf.Series: """ Creates cudf Series from cupy arrays from pylibcugraph wrapper """ @@ -55,12 +53,8 @@ def convert_to_cudf( def _call_plc_biased_random_walks( - sID: bytes, - mg_graph_x, - st_x: cudf.Series, - max_depth: int, - random_state: int - ) -> Tuple[cp.ndarray, cp.ndarray]: + sID: bytes, mg_graph_x, st_x: cudf.Series, max_depth: int, random_state: int +) -> Tuple[cp.ndarray, cp.ndarray]: return pylibcugraph_biased_random_walks( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), @@ -73,10 +67,9 @@ def _call_plc_biased_random_walks( def biased_random_walks( input_graph, - start_vertices: Union[int, list, cudf.Series, cudf.DataFrame, cudf.Series - ] = None, + start_vertices: 
Union[int, list, cudf.Series, cudf.DataFrame, cudf.Series] = None, max_depth: int = 1, - random_state: int = None + random_state: int = None, ) -> Tuple[Union[dask_cudf.Series, dask_cudf.DataFrame], dask_cudf.Series, int]: """ compute random walks under the biased sampling framework for each nodes in diff --git a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py index d3e84c4c42..96582cdd7d 100644 --- a/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/dask/sampling/node2vec_random_walks.py @@ -31,10 +31,8 @@ def convert_to_cudf( - cp_paths: cp.ndarray, - number_map=None, - is_vertex_paths: bool = False - ) -> cudf.Series: + cp_paths: cp.ndarray, number_map=None, is_vertex_paths: bool = False +) -> cudf.Series: """ Creates cudf Series from cupy arrays from pylibcugraph wrapper """ @@ -58,9 +56,9 @@ def _call_plc_node2vec_random_walks( mg_graph_x, st_x: cudf.Series, max_depth: int, - p: float, + p: float, q: float, - random_state: int + random_state: int, ) -> Tuple[cp.ndarray, cp.ndarray]: return pylibcugraph_node2vec_random_walks( @@ -76,12 +74,11 @@ def _call_plc_node2vec_random_walks( def node2vec_random_walks( input_graph, - start_vertices: Union[int, list, cudf.Series, cudf.DataFrame, cudf.Series - ] = None, + start_vertices: Union[int, list, cudf.Series, cudf.DataFrame, cudf.Series] = None, max_depth: int = 1, p: float = 1.0, q: float = 1.0, - random_state: int = None + random_state: int = None, ) -> Tuple[Union[dask_cudf.Series, dask_cudf.DataFrame], dask_cudf.Series, int]: """ compute random walks under the node2vec sampling framework for each nodes in diff --git a/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py b/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py index 429ec00ae0..dd2a069ff8 100644 --- a/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py +++ 
b/python/cugraph/cugraph/dask/sampling/uniform_random_walks.py @@ -32,10 +32,8 @@ def convert_to_cudf( - cp_paths: cp.ndarray, - number_map=None, - is_vertex_paths: bool = False - ) -> cudf.Series: + cp_paths: cp.ndarray, number_map=None, is_vertex_paths: bool = False +) -> cudf.Series: """ Creates cudf Series from cupy arrays from pylibcugraph wrapper """ @@ -55,12 +53,8 @@ def convert_to_cudf( def _call_plc_uniform_random_walks( - sID: bytes, - mg_graph_x, - st_x: cudf.Series, - max_depth: int, - random_state: int - ) -> Tuple[cp.ndarray, cp.ndarray]: + sID: bytes, mg_graph_x, st_x: cudf.Series, max_depth: int, random_state: int +) -> Tuple[cp.ndarray, cp.ndarray]: return pylibcugraph_uniform_random_walks( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), @@ -73,10 +67,9 @@ def _call_plc_uniform_random_walks( def uniform_random_walks( input_graph, - start_vertices: Union[int, list, cudf.Series, cudf.DataFrame, cudf.Series - ] = None, + start_vertices: Union[int, list, cudf.Series, cudf.DataFrame, cudf.Series] = None, max_depth: int = 1, - random_state: int = None + random_state: int = None, ) -> Tuple[Union[dask_cudf.Series, dask_cudf.DataFrame], dask_cudf.Series, int]: """ compute random walks under the uniform sampling framework for each nodes in From 74648d45dcc417dfdd685b139447f82a700dc64c Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Sun, 12 Jan 2025 17:48:55 -0800 Subject: [PATCH 40/60] deprecated old test suite --- python/cugraph/cugraph/tests/sampling/test_random_walks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/tests/sampling/test_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_random_walks.py index 76ceb47851..efb58e3ba1 100644 --- a/python/cugraph/cugraph/tests/sampling/test_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION.: +# Copyright (c) 2020-2025, NVIDIA CORPORATION.: # 
Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -40,6 +40,8 @@ def setup_function(): gc.collect() +# FIXME: This test suite must be removed once random_walks is removed from +# the python API in favor of uniform random walks def calc_random_walks(G, max_depth=None, use_padding=False, legacy_result_type=True): """ compute random walks for each nodes in 'start_vertices' From 70b8d8bcfdefce450b58dfaac700330faefb8c07 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Sun, 12 Jan 2025 18:09:18 -0800 Subject: [PATCH 41/60] add sg tests for uniform random walks --- .../sampling/test_uniform_random_walks.py | 257 ++++++++++++++++++ 1 file changed, 257 insertions(+) create mode 100644 python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py new file mode 100644 index 0000000000..f537761574 --- /dev/null +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py @@ -0,0 +1,257 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION.: +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import random + +import pytest +import networkx as nx + +import cudf +import cugraph +from cudf.testing import assert_series_equal +from cugraph.utilities import ensure_cugraph_obj_for_nx +from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS + + +# ============================================================================= +# Parameters +# ============================================================================= +DIRECTED_GRAPH_OPTIONS = [False, True] +WEIGHTED_GRAPH_OPTIONS = [False, True] +DATASETS = [pytest.param(d) for d in DEFAULT_DATASETS] +SMALL_DATASETS = [pytest.param(d) for d in SMALL_DATASETS] + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + +def calc_uniform_random_walks(G, max_depth=None): + """ + compute random walks for each nodes in 'start_vertices' + + parameters + ---------- + G : cuGraph.Graph or networkx.Graph + The graph can be either directed or undirected. + Weights in the graph are ignored. + Use weight parameter if weights need to be considered + (currently not supported) + + max_depth : int + The maximum depth of the random walks + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + sizes: int + The path size in case of coalesced paths. 
+ """ + assert G is not None + + G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="wgt") + + k = random.randint(1, 6) + + random_walks_type = "uniform" + + start_vertices = G.select_random_vertices(num_vertices=k) + + print("\nstart_vertices is \n", start_vertices) + vertex_paths, edge_weights, vertex_path_sizes = cugraph.uniform_random_walks( + G, start_vertices, max_depth + ) + + return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices + + + + +def check_random_walks(G, path_data, seeds, max_depth): + invalid_edge = 0 + invalid_seeds = 0 + invalid_edge_wgt = 0 + v_paths = path_data[0] + e_wgt_paths = path_data[1] + e_wgt_idx = 0 + + G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="wgt") + df_G = G.input_df + + if "weight" in df_G.columns: + df_G = df_G.rename(columns={"weight": "wgt"}) + + total_depth = (max_depth) * len(seeds) + + for i in range(total_depth - 1): + vertex_1, vertex_2 = v_paths.iloc[i], v_paths.iloc[i + 1] + + # Every max_depth'th vertex in 'v_paths' is a seed + # instead of 'seeds[i // (max_depth)]', could have just pop the first element + # of the seeds array once there is a match and compare it to 'vertex_1' + if i % (max_depth) == 0 and vertex_1 != seeds[i // (max_depth)]: + invalid_seeds += 1 + print( + "[ERR] Invalid seed: " + " src {} != src {}".format(vertex_1, seeds[i // (max_depth)]) + ) + + if (i % (max_depth)) != (max_depth - 1): + # These are the edges + src = vertex_1 + dst = vertex_2 + + if src != -1 and dst != -1: + # check for valid edge. 
+ edge = df_G.loc[ + (df_G["src"] == (src)) & (df_G["dst"] == (dst)) + ].reset_index(drop=True) + + if len(edge) == 0: + print( + "[ERR] Invalid edge: " + "There is no edge src {} dst {}".format(src, dst) + ) + invalid_edge += 1 + + else: + # check valid edge wgt + if G.is_weighted(): + expected_wgt = edge["wgt"].iloc[0] + result_wgt = e_wgt_paths.iloc[e_wgt_idx] + + if expected_wgt != result_wgt: + print( + "[ERR] Invalid edge wgt: " + "The edge src {} dst {} has wgt {} but got {}".format( + src, dst, expected_wgt, result_wgt + ) + ) + invalid_edge_wgt += 1 + e_wgt_idx += 1 + + if src != -1 and dst == -1: + # ensure there is no outgoing edges from 'src' + assert G.out_degree([src])["degree"].iloc[0] == 0 + + assert invalid_seeds == 0 + assert invalid_edge == 0 + assert len(v_paths) == (max_depth) * len(seeds) + if G.is_weighted(): + assert invalid_edge_wgt == 0 + assert len(e_wgt_paths) == (max_depth - 1) * len(seeds) + + + max_path_lenth = path_data[2] + assert max_path_lenth == max_depth - 1 + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("max_depth", [None]) +def test_uniform_random_walks_invalid_max_dept(graph_file, directed, max_depth): + + input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) + with pytest.raises(TypeError): + _, _, _ = calc_uniform_random_walks(input_graph, max_depth=max_depth) + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_uniform_random_walks(graph_file, directed): + max_depth = random.randint(2, 10) + print("max_depth is ", max_depth) + input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) + + path_data, seeds = calc_uniform_random_walks( + input_graph, max_depth=max_depth + ) + + check_random_walks(input_graph, path_data, seeds, max_depth) + + path_data, seeds = 
calc_uniform_random_walks( + input_graph, max_depth=max_depth + ) + + check_random_walks(input_graph, path_data, seeds, max_depth) + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +def test_uniform_random_walks_nx(graph_file): + G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) + + M = G.to_pandas_edgelist() + + source = G.source_columns + target = G.destination_columns + edge_attr = G.weight_column + + Gnx = nx.from_pandas_edgelist( + M, + source=source, + target=target, + edge_attr=edge_attr, + create_using=nx.DiGraph(), + ) + max_depth = random.randint(2, 10) + path_data, seeds = calc_uniform_random_walks(Gnx, max_depth=max_depth) + + check_random_walks(Gnx, path_data, seeds, max_depth) + + +"""@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.sg +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_random_walks( + graph_file, + directed +): + max_depth = random.randint(2, 10) + df_G = utils.read_csv_file(graph_file) + df_G.rename( + columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + df_G['src_0'] = df_G['src'] + 1000 + df_G['dst_0'] = df_G['dst'] + 1000 + + if directed: + G = cugraph.Graph(directed=True) + else: + G = cugraph.Graph() + G.from_cudf_edgelist(df_G, source=['src', 'src_0'], + destination=['dst', 'dst_0'], + edge_attr="weight") + + k = random.randint(1, 10) + start_vertices = random.sample(G.nodes().to_numpy().tolist(), k) + + seeds = cudf.DataFrame() + seeds['v'] = start_vertices + seeds['v_0'] = seeds['v'] + 1000 + + df, offsets = cugraph.random_walks(G, seeds, max_depth) + + check_random_walks(df, offsets, seeds, df_G) +""" From 9b270c70411baa92c0f9755a8d98dad33bfe8d67 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Sun, 12 Jan 2025 18:10:01 -0800 Subject: [PATCH 42/60] update copyright --- .../cugraph/cugraph/tests/sampling/test_uniform_random_walks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py index f537761574..78579aa40c 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION.: +# Copyright (c) 2025, NVIDIA CORPORATION.: # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at From 9e850d24eec4ce3640500731b90c7bd6c9ecc79c Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 13 Jan 2025 20:16:51 -0800 Subject: [PATCH 43/60] update tests --- .../sampling/test_uniform_random_walks.py | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py index 78579aa40c..9d5fef1695 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py @@ -21,7 +21,7 @@ import cugraph from cudf.testing import assert_series_equal from cugraph.utilities import ensure_cugraph_obj_for_nx -from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS +from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS, utils # ============================================================================= @@ -87,7 +87,7 @@ def calc_uniform_random_walks(G, max_depth=None): -def check_random_walks(G, path_data, seeds, max_depth): +def check_uniform_random_walks(G, path_data, seeds, max_depth): invalid_edge = 0 invalid_seeds = 0 invalid_edge_wgt = 0 @@ -103,20 +103,20 @@ def check_random_walks(G, path_data, seeds, max_depth): total_depth = (max_depth) * len(seeds) - for i in range(total_depth - 1): + for i in range(total_depth): vertex_1, 
vertex_2 = v_paths.iloc[i], v_paths.iloc[i + 1] - # Every max_depth'th vertex in 'v_paths' is a seed - # instead of 'seeds[i // (max_depth)]', could have just pop the first element + # Every max_depth'th vertex in 'v_paths' is a seed instead of + # 'seeds[i // (max_depth + 1)]', could have just pop the first element # of the seeds array once there is a match and compare it to 'vertex_1' - if i % (max_depth) == 0 and vertex_1 != seeds[i // (max_depth)]: + if i % (max_depth + 1) == 0 and vertex_1 != seeds[i // (max_depth + 1)]: invalid_seeds += 1 print( "[ERR] Invalid seed: " - " src {} != src {}".format(vertex_1, seeds[i // (max_depth)]) + " src {} != src {}".format(vertex_1, seeds[i // (max_depth + 1)]) ) - if (i % (max_depth)) != (max_depth - 1): + if (i % (max_depth + 1)) != (max_depth): # These are the edges src = vertex_1 dst = vertex_2 @@ -156,14 +156,14 @@ def check_random_walks(G, path_data, seeds, max_depth): assert invalid_seeds == 0 assert invalid_edge == 0 - assert len(v_paths) == (max_depth) * len(seeds) + assert len(v_paths) == (max_depth + 1) * len(seeds) if G.is_weighted(): assert invalid_edge_wgt == 0 - assert len(e_wgt_paths) == (max_depth - 1) * len(seeds) + assert len(e_wgt_paths) == (max_depth) * len(seeds) max_path_lenth = path_data[2] - assert max_path_lenth == max_depth - 1 + assert max_path_lenth == max_depth @pytest.mark.sg @@ -189,13 +189,10 @@ def test_uniform_random_walks(graph_file, directed): input_graph, max_depth=max_depth ) - check_random_walks(input_graph, path_data, seeds, max_depth) + print("path_data = \n", path_data) + print("seeds = \n", seeds) - path_data, seeds = calc_uniform_random_walks( - input_graph, max_depth=max_depth - ) - - check_random_walks(input_graph, path_data, seeds, max_depth) + check_uniform_random_walks(input_graph, path_data, seeds, max_depth) @pytest.mark.sg @@ -219,20 +216,21 @@ def test_uniform_random_walks_nx(graph_file): max_depth = random.randint(2, 10) path_data, seeds = 
calc_uniform_random_walks(Gnx, max_depth=max_depth) - check_random_walks(Gnx, path_data, seeds, max_depth) + check_uniform_random_walks(Gnx, path_data, seeds, max_depth) -"""@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +#"""@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.sg -@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("graph_file", [SMALL_DATASETS[0]]) +@pytest.mark.parametrize("directed", [DIRECTED_GRAPH_OPTIONS[0]]) def test_random_walks( graph_file, directed ): max_depth = random.randint(2, 10) - df_G = utils.read_csv_file(graph_file) + df_G = graph_file.get_edgelist() df_G.rename( - columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + columns={"wgt": "weight"}, inplace=True) df_G['src_0'] = df_G['src'] + 1000 df_G['dst_0'] = df_G['dst'] + 1000 @@ -245,13 +243,16 @@ def test_random_walks( edge_attr="weight") k = random.randint(1, 10) - start_vertices = random.sample(G.nodes().to_numpy().tolist(), k) + #start_vertices = random.sample(G.nodes().to_numpy().tolist(), k) + + start_vertices = G.select_random_vertices(num_vertices=k) seeds = cudf.DataFrame() seeds['v'] = start_vertices + print("seeds = \n", seeds) seeds['v_0'] = seeds['v'] + 1000 - df, offsets = cugraph.random_walks(G, seeds, max_depth) + df, offsets = cugraph.uniform-random_walks(G, seeds, max_depth) - check_random_walks(df, offsets, seeds, df_G) -""" + check_uniform_random_walks(df, offsets, seeds, df_G) +#""" From 10c471afbe971ffab7fccc7bcdd46f0cec2f96dd Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 14 Jan 2025 09:20:31 -0800 Subject: [PATCH 44/60] add support of multi column seeds --- python/cugraph/cugraph/sampling/uniform_random_walks.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/sampling/uniform_random_walks.py b/python/cugraph/cugraph/sampling/uniform_random_walks.py index 99d6695824..d0e71f401d 100644 --- 
a/python/cugraph/cugraph/sampling/uniform_random_walks.py +++ b/python/cugraph/cugraph/sampling/uniform_random_walks.py @@ -112,7 +112,10 @@ def uniform_random_walks( df_ = cudf.DataFrame() df_["vertex_paths"] = vertex_paths df_ = G.unrenumber(df_, "vertex_paths", preserve_order=True) - vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) + if len(df_.columns) > 1: + vertex_paths = df_.fillna(-1) + else: + vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) edge_wgt_paths = cudf.Series(edge_wgt_paths) From b94bad22d33b48137bd13bd22a0176d1a0911a99 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 14 Jan 2025 09:21:57 -0800 Subject: [PATCH 45/60] add support of multi column seeds for 'select_random_vertices' --- .../cugraph/structure/graph_implementation/simpleGraph.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index 4523b7f13b..d9beba70e4 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -886,7 +886,10 @@ def select_random_vertices( df_ = cudf.DataFrame() df_["vertex"] = vertices df_ = self.renumber_map.unrenumber(df_, "vertex") - vertices = df_["vertex"] + if len(df_.columns) > 1: + vertices = df_ + else: + vertices = df_["vertex"] return vertices From af68c3f22357e5ec97b7a9df137c10c0a4008576 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 14 Jan 2025 09:22:47 -0800 Subject: [PATCH 46/60] add multi column tests --- .../sampling/test_uniform_random_walks.py | 82 +++++++++++-------- 1 file changed, 49 insertions(+), 33 deletions(-) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py index 9d5fef1695..7fca738575 100644 --- 
a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py @@ -20,6 +20,7 @@ import cudf import cugraph from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal from cugraph.utilities import ensure_cugraph_obj_for_nx from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS, utils @@ -104,25 +105,48 @@ def check_uniform_random_walks(G, path_data, seeds, max_depth): total_depth = (max_depth) * len(seeds) for i in range(total_depth): - vertex_1, vertex_2 = v_paths.iloc[i], v_paths.iloc[i + 1] + if isinstance(seeds, cudf.DataFrame): + vertex_1 = v_paths.iloc[[i]].reset_index(drop=True) + vertex_2 = v_paths.iloc[[i + 1]].reset_index(drop=True) + else: + vertex_1, vertex_2 = v_paths.iloc[i], v_paths.iloc[i + 1] # Every max_depth'th vertex in 'v_paths' is a seed instead of # 'seeds[i // (max_depth + 1)]', could have just pop the first element # of the seeds array once there is a match and compare it to 'vertex_1' - if i % (max_depth + 1) == 0 and vertex_1 != seeds[i // (max_depth + 1)]: - invalid_seeds += 1 - print( - "[ERR] Invalid seed: " - " src {} != src {}".format(vertex_1, seeds[i // (max_depth + 1)]) - ) + + if i % (max_depth + 1) == 0: + if isinstance(seeds, cudf.DataFrame): + assert_frame_equal( + vertex_1.rename(columns={x:y for x,y in zip(vertex_1.columns,range(0,len(vertex_1.columns)))}), + seeds.iloc[[i // (max_depth + 1)]].reset_index(drop=True).rename(columns={x:y for x,y in zip(seeds.columns,range(0,len(seeds.columns)))}), + check_dtype=False, check_like=True) + else: + if i % (max_depth + 1) == 0 and vertex_1 != seeds[i // (max_depth + 1)]: + invalid_seeds += 1 + print( + "[ERR] Invalid seed: " + " src {} != src {}".format(vertex_1, seeds[i // (max_depth + 1)]) + ) if (i % (max_depth + 1)) != (max_depth): # These are the edges src = vertex_1 dst = vertex_2 - - if src != -1 and dst != -1: - # check for valid edge. 
+ + # check for valid edge. + if isinstance(seeds, cudf.DataFrame): + if (-1 not in src.iloc[0].reset_index(drop=True)) and (-1 not in dst.iloc[0].reset_index(drop=True)): + edge = cudf.DataFrame() + edge["src"] = vertex_1["0_vertex_paths"] + edge["src_0"] = vertex_1["1_vertex_paths"] + edge["dst"] = vertex_2["0_vertex_paths"] + edge["dst_0"] = vertex_2["1_vertex_paths"] + + join1 = cudf.merge(df_G, edge, on=[*edge.columns]) + + assert len(cudf.merge(df_G, edge, on=[*edge.columns])) > 0 + else: edge = df_G.loc[ (df_G["src"] == (src)) & (df_G["dst"] == (dst)) ].reset_index(drop=True) @@ -148,11 +172,11 @@ def check_uniform_random_walks(G, path_data, seeds, max_depth): ) ) invalid_edge_wgt += 1 - e_wgt_idx += 1 + e_wgt_idx += 1 - if src != -1 and dst == -1: - # ensure there is no outgoing edges from 'src' - assert G.out_degree([src])["degree"].iloc[0] == 0 + if src != -1 and dst == -1: + # ensure there is no outgoing edges from 'src' + assert G.out_degree([src])["degree"].iloc[0] == 0 assert invalid_seeds == 0 assert invalid_edge == 0 @@ -189,9 +213,6 @@ def test_uniform_random_walks(graph_file, directed): input_graph, max_depth=max_depth ) - print("path_data = \n", path_data) - print("seeds = \n", seeds) - check_uniform_random_walks(input_graph, path_data, seeds, max_depth) @@ -219,11 +240,10 @@ def test_uniform_random_walks_nx(graph_file): check_uniform_random_walks(Gnx, path_data, seeds, max_depth) -#"""@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.sg -@pytest.mark.parametrize("graph_file", [SMALL_DATASETS[0]]) -@pytest.mark.parametrize("directed", [DIRECTED_GRAPH_OPTIONS[0]]) -def test_random_walks( +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_random_walks_multi_column_seeds( graph_file, directed ): @@ -243,16 +263,12 @@ def test_random_walks( edge_attr="weight") k = random.randint(1, 10) - #start_vertices = random.sample(G.nodes().to_numpy().tolist(), 
k) - start_vertices = G.select_random_vertices(num_vertices=k) - - seeds = cudf.DataFrame() - seeds['v'] = start_vertices - print("seeds = \n", seeds) - seeds['v_0'] = seeds['v'] + 1000 - - df, offsets = cugraph.uniform-random_walks(G, seeds, max_depth) - - check_uniform_random_walks(df, offsets, seeds, df_G) -#""" + seeds = G.select_random_vertices(num_vertices=k) + vertex_paths, edge_weights, vertex_path_sizes = cugraph.uniform_random_walks( + G, seeds, max_depth) + + path_data = (vertex_paths, edge_weights, vertex_path_sizes) + + check_uniform_random_walks(G, path_data, seeds, max_depth) + \ No newline at end of file From 0c8f85edda2489846e4cd6b25312ccc132082b4d Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 14 Jan 2025 09:36:23 -0800 Subject: [PATCH 47/60] add support of multi column seeds --- .../cugraph/sampling/biased_random_walks.py | 5 ++++- .../cugraph/sampling/node2vec_random_walks.py | 18 +++++++++++------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/sampling/biased_random_walks.py b/python/cugraph/cugraph/sampling/biased_random_walks.py index 41340cba8a..ca35d88a3d 100644 --- a/python/cugraph/cugraph/sampling/biased_random_walks.py +++ b/python/cugraph/cugraph/sampling/biased_random_walks.py @@ -112,7 +112,10 @@ def biased_random_walks( df_ = cudf.DataFrame() df_["vertex_paths"] = vertex_paths df_ = G.unrenumber(df_, "vertex_paths", preserve_order=True) - vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) + if len(df_.columns) > 1: + vertex_paths = df_.fillna(-1) + else: + vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) edge_wgt_paths = cudf.Series(edge_wgt_paths) diff --git a/python/cugraph/cugraph/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/sampling/node2vec_random_walks.py index 1f93f3c61d..68e9f3f072 100644 --- a/python/cugraph/cugraph/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/sampling/node2vec_random_walks.py @@ -141,7 +141,7 @@ def 
node2vec_random_walks( start_vertices = ensure_valid_dtype(G, start_vertices) - vertex_set, edge_set = pylibcugraph_node2vec_random_walks( + vertex_paths, edge_wgt_paths = pylibcugraph_node2vec_random_walks( resource_handle=ResourceHandle(), graph=G._plc_graph, seed_array=start_vertices, @@ -150,12 +150,16 @@ def node2vec_random_walks( q=q, random_state=random_state, ) - vertex_set = cudf.Series(vertex_set) - edge_set = cudf.Series(edge_set) + vertex_paths = cudf.Series(vertex_paths) + edge_wgt_paths = cudf.Series(edge_wgt_paths) if G.renumbered: df_ = cudf.DataFrame() - df_["vertex_set"] = vertex_set - df_ = G.unrenumber(df_, "vertex_set", preserve_order=True) - vertex_set = cudf.Series(df_["vertex_set"]) - return vertex_set, edge_set, max_depth + df_["vertex_paths"] = vertex_paths + df_ = G.unrenumber(df_, "vertex_paths", preserve_order=True) + if len(df_.columns) > 1: + vertex_paths = df_.fillna(-1) + else: + vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) + + return vertex_paths, edge_wgt_paths, max_depth From 37d3a47bab12b6f0067c04f75fc767db7b224f13 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 14 Jan 2025 09:40:18 -0800 Subject: [PATCH 48/60] add test for biased random walks --- .../sampling/test_biased_random_walks.py | 250 ++++++++++++++++++ 1 file changed, 250 insertions(+) create mode 100644 python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py diff --git a/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py new file mode 100644 index 0000000000..b6097aa1ed --- /dev/null +++ b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py @@ -0,0 +1,250 @@ +# Copyright (c) 2025, NVIDIA CORPORATION.: +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random + +import pytest +import networkx as nx + +import cudf +import cugraph +from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal +from cugraph.utilities import ensure_cugraph_obj_for_nx +from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS, utils + + +# ============================================================================= +# Parameters +# ============================================================================= +DIRECTED_GRAPH_OPTIONS = [False, True] +WEIGHTED_GRAPH_OPTIONS = [False, True] +DATASETS = [pytest.param(d) for d in DEFAULT_DATASETS] +SMALL_DATASETS = [pytest.param(d) for d in SMALL_DATASETS] + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + +def calc_biased_random_walks(G, max_depth=None): + """ + compute random walks for each nodes in 'start_vertices' + + parameters + ---------- + G : cuGraph.Graph or networkx.Graph + The graph can be either directed or undirected. + Weights in the graph are ignored. + Use weight parameter if weights need to be considered + (currently not supported) + + max_depth : int + The maximum depth of the random walks + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. 
+ + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + sizes: int + The path size in case of coalesced paths. + """ + assert G is not None + + G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="wgt") + + k = random.randint(1, 6) + + random_walks_type = "biased" + + start_vertices = G.select_random_vertices(num_vertices=k) + + print("\nstart_vertices is \n", start_vertices) + vertex_paths, edge_weights, vertex_path_sizes = cugraph.biased_random_walks( + G, start_vertices, max_depth + ) + + return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices + + + + +def check_biased_random_walks(G, path_data, seeds, max_depth): + invalid_edge = 0 + invalid_seeds = 0 + invalid_edge_wgt = 0 + v_paths = path_data[0] + e_wgt_paths = path_data[1] + e_wgt_idx = 0 + + G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="wgt") + df_G = G.input_df + + if "weight" in df_G.columns: + df_G = df_G.rename(columns={"weight": "wgt"}) + + total_depth = (max_depth) * len(seeds) + + for i in range(total_depth): + if isinstance(seeds, cudf.DataFrame): + vertex_1 = v_paths.iloc[[i]].reset_index(drop=True) + vertex_2 = v_paths.iloc[[i + 1]].reset_index(drop=True) + else: + vertex_1, vertex_2 = v_paths.iloc[i], v_paths.iloc[i + 1] + + # Every max_depth'th vertex in 'v_paths' is a seed instead of + # 'seeds[i // (max_depth + 1)]', could have just pop the first element + # of the seeds array once there is a match and compare it to 'vertex_1' + + if i % (max_depth + 1) == 0: + if isinstance(seeds, cudf.DataFrame): + assert_frame_equal( + vertex_1.rename(columns={x:y for x,y in zip(vertex_1.columns,range(0,len(vertex_1.columns)))}), + seeds.iloc[[i // (max_depth + 1)]].reset_index(drop=True).rename(columns={x:y for x,y in zip(seeds.columns,range(0,len(seeds.columns)))}), + check_dtype=False, check_like=True) + else: + if i % (max_depth + 1) == 0 and vertex_1 != seeds[i // (max_depth + 1)]: + invalid_seeds += 1 + print( 
+ "[ERR] Invalid seed: " + " src {} != src {}".format(vertex_1, seeds[i // (max_depth + 1)]) + ) + + if (i % (max_depth + 1)) != (max_depth): + # These are the edges + src = vertex_1 + dst = vertex_2 + + # check for valid edge. + if isinstance(seeds, cudf.DataFrame): + if (-1 not in src.iloc[0].reset_index(drop=True)) and (-1 not in dst.iloc[0].reset_index(drop=True)): + edge = cudf.DataFrame() + edge["src"] = vertex_1["0_vertex_paths"] + edge["src_0"] = vertex_1["1_vertex_paths"] + edge["dst"] = vertex_2["0_vertex_paths"] + edge["dst_0"] = vertex_2["1_vertex_paths"] + + join1 = cudf.merge(df_G, edge, on=[*edge.columns]) + + assert len(cudf.merge(df_G, edge, on=[*edge.columns])) > 0 + else: + edge = df_G.loc[ + (df_G["src"] == (src)) & (df_G["dst"] == (dst)) + ].reset_index(drop=True) + + if len(edge) == 0: + print( + "[ERR] Invalid edge: " + "There is no edge src {} dst {}".format(src, dst) + ) + invalid_edge += 1 + + else: + # check valid edge wgt + if G.is_weighted(): + expected_wgt = edge["wgt"].iloc[0] + result_wgt = e_wgt_paths.iloc[e_wgt_idx] + + if expected_wgt != result_wgt: + print( + "[ERR] Invalid edge wgt: " + "The edge src {} dst {} has wgt {} but got {}".format( + src, dst, expected_wgt, result_wgt + ) + ) + invalid_edge_wgt += 1 + e_wgt_idx += 1 + + if src != -1 and dst == -1: + # ensure there is no outgoing edges from 'src' + assert G.out_degree([src])["degree"].iloc[0] == 0 + + assert invalid_seeds == 0 + assert invalid_edge == 0 + assert len(v_paths) == (max_depth + 1) * len(seeds) + if G.is_weighted(): + assert invalid_edge_wgt == 0 + assert len(e_wgt_paths) == (max_depth) * len(seeds) + + + max_path_lenth = path_data[2] + assert max_path_lenth == max_depth + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("max_depth", [None]) +def test_biased_random_walks_invalid_max_dept(graph_file, directed, max_depth): + + input_graph = 
graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) + with pytest.raises(TypeError): + _, _, _ = calc_biased_random_walks(input_graph, max_depth=max_depth) + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_biased_random_walks(graph_file, directed): + max_depth = random.randint(2, 10) + print("max_depth is ", max_depth) + input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) + + path_data, seeds = calc_biased_random_walks( + input_graph, max_depth=max_depth + ) + + check_biased_random_walks(input_graph, path_data, seeds, max_depth) + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_random_walks_multi_column_seeds( + graph_file, + directed +): + max_depth = random.randint(2, 10) + df_G = graph_file.get_edgelist() + df_G.rename( + columns={"wgt": "weight"}, inplace=True) + df_G['src_0'] = df_G['src'] + 1000 + df_G['dst_0'] = df_G['dst'] + 1000 + + if directed: + G = cugraph.Graph(directed=True) + else: + G = cugraph.Graph() + G.from_cudf_edgelist(df_G, source=['src', 'src_0'], + destination=['dst', 'dst_0'], + edge_attr="weight") + + k = random.randint(1, 10) + + seeds = G.select_random_vertices(num_vertices=k) + vertex_paths, edge_weights, vertex_path_sizes = cugraph.biased_random_walks( + G, seeds, max_depth) + + path_data = (vertex_paths, edge_weights, vertex_path_sizes) + + check_biased_random_walks(G, path_data, seeds, max_depth) + \ No newline at end of file From e7a5952e4a11582994a79aad4d3e7c74255047f1 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 15 Jan 2025 07:38:25 -0800 Subject: [PATCH 49/60] add mg ECG --- python/cugraph/cugraph/dask/community/ecg.py | 219 +++++++++++++++++++ 1 file changed, 219 insertions(+) create mode 100644 python/cugraph/cugraph/dask/community/ecg.py diff --git 
a/python/cugraph/cugraph/dask/community/ecg.py b/python/cugraph/cugraph/dask/community/ecg.py new file mode 100644 index 0000000000..8556b4da78 --- /dev/null +++ b/python/cugraph/cugraph/dask/community/ecg.py @@ -0,0 +1,219 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from __future__ import annotations + +from dask.distributed import wait, default_client +import cugraph.dask.comms.comms as Comms +import dask_cudf +import dask +from dask import delayed +import cudf + +from pylibcugraph import ResourceHandle +from pylibcugraph import ecg as pylibcugraph_ecg +import numpy +import cupy as cp +from typing import Tuple, TYPE_CHECKING + +if TYPE_CHECKING: + from cugraph import Graph + + +def convert_to_cudf(result: cp.ndarray) -> Tuple[cudf.DataFrame, float]: + """ + Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper + """ + cupy_vertex, cupy_partition, modularity = result + df = cudf.DataFrame() + df["vertex"] = cupy_vertex + df["partition"] = cupy_partition + + return df, modularity + + +def _call_plc_ecg( + sID: bytes, + mg_graph_x, + max_iter: int, + resolution: int, + random_state: int, + theta: int, + do_expensive_check: bool, +) -> Tuple[cp.ndarray, cp.ndarray, float]: + return pylibcugraph_ecg( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + random_state=random_state, + graph=mg_graph_x, + max_level=max_iter, + resolution=resolution, + theta=theta, + 
do_expensive_check=do_expensive_check, + ) + + +def ecg( + input_graph, + min_weight: float = 0.0001, + ensemble_size: int = 100, + max_level: int = 10, + threshold: float = 1e-7, + resolution: float = 1.0, + random_state: int = None, + weight=None, +) -> Tuple[dask_cudf.DataFrame, float]: + """ + Compute the Ensemble Clustering for Graphs (ECG) partition of the input + graph. ECG runs truncated Louvain on an ensemble of permutations of the + input graph, then uses the ensemble partitions to determine weights for + the input graph. The final result is found by running full Louvain on + the input graph using the determined weights. + + See https://arxiv.org/abs/1809.05578 for further information. + + Parameters + ---------- + input_graph : cugraph.Graph or NetworkX Graph + The graph descriptor should contain the connectivity information + and weights. The adjacency list will be computed if not already + present. + + min_weight : float, optional (default=0.5) + The minimum value to assign as an edgeweight in the ECG algorithm. + It should be a value in the range [0,1] usually left as the default + value of .05 + + ensemble_size : integer, optional (default=16) + The number of graph permutations to use for the ensemble. + The default value is 16, larger values may produce higher quality + partitions for some graphs. + + max_level : integer, optional (default=100) + This controls the maximum number of levels/iterations of the ECG + algorithm. When specified the algorithm will terminate after no more + than the specified number of iterations. No error occurs when the + algorithm terminates early in this manner. + + threshold: float + Modularity gain threshold for each level. If the gain of + modularity between 2 levels of the algorithm is less than the + given threshold then the algorithm stops and returns the + resulting communities. + Defaults to 1e-7. 
+ + resolution: float, optional (default=1.0) + Called gamma in the modularity formula, this changes the size + of the communities. Higher resolutions lead to more smaller + communities, lower resolutions lead to fewer larger communities. + Defaults to 1. + + random_state: int, optional(default=None) + Random state to use when generating samples. Optional argument, + defaults to a hash of process id, time, and hostname. + + weight : str, optional (default=None) + Deprecated. + This parameter is here for NetworkX compatibility and + represents which NetworkX data column represents Edge weights. + + Returns + ------- + parts : dask_cudf.DataFrame + GPU data frame of size V containing two columns the vertex id and the + partition id it is assigned to. + + ddf['vertex'] : cudf.Series + Contains the vertex identifiers + ddf['partition'] : cudf.Series + Contains the partition assigned to the vertices + + modularity_score : float + a floating point number containing the global modularity score of the + partitioning. + + Examples + -------- + >>> import cugraph.dask as dcg + >>> import dask_cudf + >>> # ... Init a DASK Cluster + >>> # see https://docs.rapids.ai/api/cugraph/stable/dask-cugraph.html + >>> # Download dataset from https://github.com/rapidsai/cugraph/datasets/.. + >>> chunksize = dcg.get_chunksize(datasets_path / "karate.csv") + >>> ddf = dask_cudf.read_csv(datasets_path / "karate.csv", + ... blocksize=chunksize, delimiter=" ", + ... names=["src", "dst", "value"], + ... 
dtype=["int32", "int32", "float32"]) + >>> dg = cugraph.Graph() + >>> dg.from_dask_cudf_edgelist(ddf, source='src', destination='dst') + >>> parts, modularity_score = dcg.ecg(dg) + + """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Return a client if one has started + client = default_client() + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_ecg, + Comms.get_session_id(), + input_graph._plc_graph[w], + max_iter, + resolution, + random_state, + theta, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + part_mod_score = [client.submit(convert_to_cudf, r) for r in result] + wait(part_mod_score) + + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes.iloc[0] + empty_df = cudf.DataFrame( + { + "vertex": numpy.empty(shape=0, dtype=vertex_dtype), + "partition": numpy.empty(shape=0, dtype="int32"), + } + ) + + part_mod_score = [delayed(lambda x: x, nout=2)(r) for r in part_mod_score] + + ddf = dask_cudf.from_delayed( + [r[0] for r in part_mod_score], meta=empty_df, verify_meta=False + ).persist() + + mod_score = dask.array.from_delayed( + part_mod_score[0][1], shape=(1,), dtype=float + ).compute() + + wait(ddf) + wait(mod_score) + + wait([r.release() for r in part_mod_score]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "vertex") + + return ddf, mod_score From dfe8dd2193e10e50cababc6a49f84ef2e38160f1 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 15 Jan 2025 19:34:55 -0800 Subject: [PATCH 50/60] update MG ecg implementation --- python/cugraph/cugraph/dask/__init__.py | 1 + .../cugraph/dask/community/__init__.py | 1 + python/cugraph/cugraph/dask/community/ecg.py | 28 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index 617eb25a2b..69aba2c8aa 100644 --- 
a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -45,6 +45,7 @@ from .link_prediction.cosine import cosine from .link_prediction.cosine import all_pairs_cosine from .community.leiden import leiden +from .community.ecg import ecg # Avoid "p2p" shuffling in dask for now config.set({"dataframe.shuffle.method": "tasks"}) diff --git a/python/cugraph/cugraph/dask/community/__init__.py b/python/cugraph/cugraph/dask/community/__init__.py index 9b5301d0e4..146e837bd8 100644 --- a/python/cugraph/cugraph/dask/community/__init__.py +++ b/python/cugraph/cugraph/dask/community/__init__.py @@ -16,3 +16,4 @@ from .induced_subgraph import induced_subgraph from .leiden import leiden from .ktruss_subgraph import ktruss_subgraph +from .ecg import ecg diff --git a/python/cugraph/cugraph/dask/community/ecg.py b/python/cugraph/cugraph/dask/community/ecg.py index 8556b4da78..3ed9947783 100644 --- a/python/cugraph/cugraph/dask/community/ecg.py +++ b/python/cugraph/cugraph/dask/community/ecg.py @@ -47,19 +47,23 @@ def convert_to_cudf(result: cp.ndarray) -> Tuple[cudf.DataFrame, float]: def _call_plc_ecg( sID: bytes, mg_graph_x, - max_iter: int, + min_weight: float, + ensemble_size: int, + max_level: int, + threshold: float, resolution: int, random_state: int, - theta: int, do_expensive_check: bool, ) -> Tuple[cp.ndarray, cp.ndarray, float]: return pylibcugraph_ecg( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), random_state=random_state, graph=mg_graph_x, - max_level=max_iter, + min_weight=min_weight, + ensemble_size=ensemble_size, + max_level=max_level, + threshold=threshold, resolution=resolution, - theta=theta, do_expensive_check=do_expensive_check, ) @@ -71,8 +75,7 @@ def ecg( max_level: int = 10, threshold: float = 1e-7, resolution: float = 1.0, - random_state: int = None, - weight=None, + random_state: int = None ) -> Tuple[dask_cudf.DataFrame, float]: """ Compute the Ensemble Clustering for Graphs (ECG) partition of the 
input @@ -123,10 +126,6 @@ def ecg( Random state to use when generating samples. Optional argument, defaults to a hash of process id, time, and hostname. - weight : str, optional (default=None) - Deprecated. - This parameter is here for NetworkX compatibility and - represents which NetworkX data column represents Edge weights. Returns ------- @@ -161,9 +160,6 @@ def ecg( """ - if input_graph.is_directed(): - raise ValueError("input graph must be undirected") - # Return a client if one has started client = default_client() @@ -174,10 +170,12 @@ def ecg( _call_plc_ecg, Comms.get_session_id(), input_graph._plc_graph[w], - max_iter, + min_weight, + ensemble_size, + max_level, + threshold, resolution, random_state, - theta, do_expensive_check, workers=[w], allow_other_workers=False, From 5015e30e7f79e438d8288b069b265487d0b3f510 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 16 Jan 2025 07:06:18 -0800 Subject: [PATCH 51/60] add mg ecg tests --- .../cugraph/tests/community/test_ecg_mg.py | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 python/cugraph/cugraph/tests/community/test_ecg_mg.py diff --git a/python/cugraph/cugraph/tests/community/test_ecg_mg.py b/python/cugraph/cugraph/tests/community/test_ecg_mg.py new file mode 100644 index 0000000000..356c812fea --- /dev/null +++ b/python/cugraph/cugraph/tests/community/test_ecg_mg.py @@ -0,0 +1,106 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import pytest + +import cugraph +import cugraph.dask as dcg +from cugraph.datasets import karate, dolphins, netscience + + +# ============================================================================= +# Parameters +# ============================================================================= + + +DATASETS = [dolphins, karate, netscience] + +MIN_WEIGHTS = [0.05, 0.15] + +ENSEMBLE_SIZES = [16, 32] + +MAX_LEVELS = [10, 20] + +RESOLUTIONS = [0.95, 1.0] + +THRESHOLDS = [1e-6, 1e-07] + +RANDOM_STATES = [0, 42] + + +# ============================================================================= +# Helper Functions +# ============================================================================= + + +def get_mg_graph(dataset, directed): + """Returns an MG graph""" + ddf = dataset.get_dask_edgelist() + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist(ddf, "src", "dst", "wgt") + + return dg + + +def golden_call(filename): + if filename == "dolphins": + return 0.4962422251701355 + if filename == "karate": + return 0.38428664207458496 + if filename == "netscience": + return 0.9279554486274719 + + +# ============================================================================= +# Tests +# ============================================================================= +# FIXME: Implement more robust tests + + +@pytest.mark.mg +@pytest.mark.parametrize("dataset", DATASETS) +@pytest.mark.parametrize("min_weight", MIN_WEIGHTS) +@pytest.mark.parametrize("ensemble_size", ENSEMBLE_SIZES) +@pytest.mark.parametrize("max_level", MAX_LEVELS) +@pytest.mark.parametrize("threshold", THRESHOLDS) +@pytest.mark.parametrize("resolution", RESOLUTIONS) +@pytest.mark.parametrize("random_state", RANDOM_STATES) +def test_mg_ecg( + dask_client, dataset, min_weight, ensemble_size, max_level, threshold, resolution, random_state): + filename = dataset.metadata["name"] + dg = get_mg_graph(dataset, directed=False) + parts, mod = dcg.ecg( + dg, + min_weight=min_weight, + 
ensemble_size=ensemble_size, + max_level=max_level, + threshold=threshold, + resolution=resolution, + random_state=random_state) + + filename = dataset.metadata["name"] + golden_score = golden_call(filename) + + # Assert that the partitioning has better modularity than the random + # assignment + assert mod > (0.80 * golden_score) + + #print("mod score = ", mod) + + # FIXME: either call Nx with the same dataset and compare results, or + # hardcode golden results to compare to. + print() + print(parts.compute()) + print(mod) + print() From 92bfb5771ef78d37fead4de4c55c9e13fa2efd0f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 16 Jan 2025 08:30:37 -0800 Subject: [PATCH 52/60] pass different random seeds to each GPU --- python/cugraph/cugraph/dask/community/ecg.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/dask/community/ecg.py b/python/cugraph/cugraph/dask/community/ecg.py index 3ed9947783..63609a7ca1 100644 --- a/python/cugraph/cugraph/dask/community/ecg.py +++ b/python/cugraph/cugraph/dask/community/ecg.py @@ -175,12 +175,12 @@ def ecg( max_level, threshold, resolution, - random_state, + (random_state + i) if random_state is not None else random_state, do_expensive_check, workers=[w], allow_other_workers=False, ) - for w in Comms.get_workers() + for i, w in enumerate(Comms.get_workers()) ] wait(result) From 1282d52fdc188a47b803ed39f2292b5d4e2cc8cf Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 16 Jan 2025 08:31:03 -0800 Subject: [PATCH 53/60] add mg tests for biased random walks --- .../sampling/test_biased_random_walks_mg.py | 212 ++++++++++++++++++ .../sampling/test_uniform_random_walks_mg.py | 212 ++++++++++++++++++ 2 files changed, 424 insertions(+) create mode 100644 python/cugraph/cugraph/tests/sampling/test_biased_random_walks_mg.py create mode 100644 python/cugraph/cugraph/tests/sampling/test_uniform_random_walks_mg.py diff --git a/python/cugraph/cugraph/tests/sampling/test_biased_random_walks_mg.py 
b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks_mg.py new file mode 100644 index 0000000000..db51e6ca79 --- /dev/null +++ b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks_mg.py @@ -0,0 +1,212 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random + +import pytest + +import cugraph +import dask_cudf +import cugraph.dask as dcg +import cudf +from cugraph.testing import SMALL_DATASETS +from cugraph.datasets import karate_asymmetric +from cugraph.structure.symmetrize import symmetrize +from pylibcugraph.testing.utils import gen_fixture_params_product + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def setup_function(): + gc.collect() + + +IS_DIRECTED = [True, False] + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= + +datasets = SMALL_DATASETS + [karate_asymmetric] + +fixture_params = gen_fixture_params_product( + (datasets, "graph_file"), + (IS_DIRECTED, "directed"), +) + + +def calc_biased_random_walks(G): + """ + compute random walks + + parameters + ---------- + G : cuGraph.Graph or networkx.Graph + The graph can be either directed (DiGraph) or undirected 
(Graph). + Weights in the graph are ignored. + Use weight parameter if weights need to be considered + (currently not supported) + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + max_path_length : int + The maximum path length + + start_vertices : list + Roots for the random walks + + max_depth : int + """ + k = random.randint(1, 4) + max_depth = random.randint(2, 4) + + start_vertices = G.nodes().compute().sample(k).reset_index(drop=True) + + vertex_paths, edge_weights, max_path_length = dcg.biased_random_walks( + G, start_vertices, max_depth + ) + + return (vertex_paths, edge_weights, max_path_length), start_vertices, max_depth + + +def check_biased_random_walks(G, path_data, seeds, max_depth, df_G=None): + invalid_edge = 0 + invalid_edge_wgt_path = 0 + invalid_seeds = 0 + next_path_idx = 0 + invalid_edge_wgt_path = 0 + e_wgt_path_idx = 0 + v_paths = path_data[0].compute() + e_paths = path_data[1].compute() + + max_path_length = path_data[2] + sizes = max_path_length + + for _ in range(len(seeds)): + for i in range(next_path_idx, next_path_idx + sizes): + src, dst = v_paths.iloc[i], v_paths.iloc[i + 1] + + if i == next_path_idx and src not in seeds.values: + invalid_seeds += 1 + print("[ERR] Invalid seed: " " src {} != src {}".format(src, seeds)) + + else: + # If everything is good proceed to the next part + # now check the destination + + # find the src out_degree to ensure it effectively has no outgoing edges + # No need to check for -1 values, move to the next iteration + if src != -1: + src_degree = G.out_degree([src])["degree"].compute()[0] + if dst == -1 and src_degree == 0: + if e_paths.values[e_wgt_path_idx] != 0: + wgt = e_paths.values[e_wgt_path_idx] + print( + "[ERR] Invalid edge weight path: " + "Edge src {} dst {} has wgt 0 " + "But got 
wgt {}".format(src, dst, wgt) + ) + invalid_edge_wgt_path += 1 + else: + exp_edge = df_G.loc[ + (df_G["src"] == (src)) & (df_G["dst"] == (dst)) + ].reset_index(drop=True) + + if len(exp_edge) == 0: + print( + "[ERR] Invalid edge: " + "There is no edge src {} dst {}".format(src, dst) + ) + invalid_edge += 1 + else: + # This is a valid edge, check the edge_wgt_path + if e_paths.values[e_wgt_path_idx] != 1: + wgt = e_paths.values[e_wgt_path_idx] + print( + "[ERR] Invalid edge weight path: " + "Edge src {} dst {} has wgt 1 " + "But got wgt {}".format(src, dst, wgt) + ) + invalid_edge_wgt_path += 1 + else: + # v_path: src == -1, dst == -1 => e_wgt_path=0 otherwise ERROR + if e_paths.values[e_wgt_path_idx] != 0: + wgt = e_paths.values[e_wgt_path_idx] + print( + "[ERR] Invalid edge weight path: " + "Edge src {} dst {} has wgt 0 " + "But got wgt {}".format(src, dst, wgt) + ) + invalid_edge_wgt_path += 1 + + e_wgt_path_idx += 1 + next_path_idx += sizes + 1 + + assert invalid_edge == 0 + assert invalid_seeds == 0 + assert invalid_edge_wgt_path == 0 + assert max_path_length == max_depth + + +@pytest.fixture(scope="module", params=fixture_params) +def input_graph(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. 
+ """ + parameters = dict(zip(("graph_file", "directed"), request.param)) + input_data_path = parameters["graph_file"].get_path() + directed = parameters["directed"] + + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value", + renumber=True, + store_transposed=True, + ) + + return dg + + +@pytest.mark.mg +def test_dask_mg_biased_random_walks(dask_client, input_graph): + path_data, seeds, max_depth = calc_biased_random_walks(input_graph) + df_G = input_graph.input_df.compute().reset_index(drop=True) + + df_G = input_graph.decompress_to_edgelist( + return_unrenumbered_edgelist=True).compute().reset_index(drop=True) + + check_biased_random_walks(input_graph, path_data, seeds, max_depth, df_G) diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks_mg.py new file mode 100644 index 0000000000..50aeb0ec84 --- /dev/null +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks_mg.py @@ -0,0 +1,212 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import random + +import pytest + +import cugraph +import dask_cudf +import cugraph.dask as dcg +import cudf +from cugraph.testing import SMALL_DATASETS +from cugraph.datasets import karate_asymmetric +from cugraph.structure.symmetrize import symmetrize +from pylibcugraph.testing.utils import gen_fixture_params_product + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def setup_function(): + gc.collect() + + +IS_DIRECTED = [True, False] + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= + +datasets = SMALL_DATASETS + [karate_asymmetric] + +fixture_params = gen_fixture_params_product( + (datasets, "graph_file"), + (IS_DIRECTED, "directed"), +) + + +def calc_uniform_random_walks(G): + """ + compute random walks + + parameters + ---------- + G : cuGraph.Graph or networkx.Graph + The graph can be either directed (DiGraph) or undirected (Graph). + Weights in the graph are ignored. + Use weight parameter if weights need to be considered + (currently not supported) + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. 
+ + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + max_path_length : int + The maximum path length + + start_vertices : list + Roots for the random walks + + max_depth : int + """ + k = random.randint(1, 4) + max_depth = random.randint(2, 4) + + start_vertices = G.nodes().compute().sample(k).reset_index(drop=True) + + vertex_paths, edge_weights, max_path_length = dcg.uniform_random_walks( + G, start_vertices, max_depth + ) + + return (vertex_paths, edge_weights, max_path_length), start_vertices, max_depth + + +def check_uniform_random_walks(G, path_data, seeds, max_depth, df_G=None): + invalid_edge = 0 + invalid_edge_wgt_path = 0 + invalid_seeds = 0 + next_path_idx = 0 + invalid_edge_wgt_path = 0 + e_wgt_path_idx = 0 + v_paths = path_data[0].compute() + e_paths = path_data[1].compute() + + max_path_length = path_data[2] + sizes = max_path_length + + for _ in range(len(seeds)): + for i in range(next_path_idx, next_path_idx + sizes): + src, dst = v_paths.iloc[i], v_paths.iloc[i + 1] + + if i == next_path_idx and src not in seeds.values: + invalid_seeds += 1 + print("[ERR] Invalid seed: " " src {} != src {}".format(src, seeds)) + + else: + # If everything is good proceed to the next part + # now check the destination + + # find the src out_degree to ensure it effectively has no outgoing edges + # No need to check for -1 values, move to the next iteration + if src != -1: + src_degree = G.out_degree([src])["degree"].compute()[0] + if dst == -1 and src_degree == 0: + if e_paths.values[e_wgt_path_idx] != 0: + wgt = e_paths.values[e_wgt_path_idx] + print( + "[ERR] Invalid edge weight path: " + "Edge src {} dst {} has wgt 0 " + "But got wgt {}".format(src, dst, wgt) + ) + invalid_edge_wgt_path += 1 + else: + exp_edge = df_G.loc[ + (df_G["src"] == (src)) & (df_G["dst"] == (dst)) + ].reset_index(drop=True) + + if len(exp_edge) == 0: + print( + "[ERR] Invalid edge: " + "There is no edge src {} dst 
{}".format(src, dst) + ) + invalid_edge += 1 + else: + # This is a valid edge, check the edge_wgt_path + if e_paths.values[e_wgt_path_idx] != 1: + wgt = e_paths.values[e_wgt_path_idx] + print( + "[ERR] Invalid edge weight path: " + "Edge src {} dst {} has wgt 1 " + "But got wgt {}".format(src, dst, wgt) + ) + invalid_edge_wgt_path += 1 + else: + # v_path: src == -1, dst == -1 => e_wgt_path=0 otherwise ERROR + if e_paths.values[e_wgt_path_idx] != 0: + wgt = e_paths.values[e_wgt_path_idx] + print( + "[ERR] Invalid edge weight path: " + "Edge src {} dst {} has wgt 0 " + "But got wgt {}".format(src, dst, wgt) + ) + invalid_edge_wgt_path += 1 + + e_wgt_path_idx += 1 + next_path_idx += sizes + 1 + + assert invalid_edge == 0 + assert invalid_seeds == 0 + assert invalid_edge_wgt_path == 0 + assert max_path_length == max_depth + + +@pytest.fixture(scope="module", params=fixture_params) +def input_graph(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. 
+ """ + parameters = dict(zip(("graph_file", "directed"), request.param)) + input_data_path = parameters["graph_file"].get_path() + directed = parameters["directed"] + + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value", + renumber=True, + store_transposed=True, + ) + + return dg + + +@pytest.mark.mg +def test_dask_mg_uniform_random_walks(dask_client, input_graph): + path_data, seeds, max_depth = calc_uniform_random_walks(input_graph) + df_G = input_graph.input_df.compute().reset_index(drop=True) + + df_G = input_graph.decompress_to_edgelist( + return_unrenumbered_edgelist=True).compute().reset_index(drop=True) + + check_uniform_random_walks(input_graph, path_data, seeds, max_depth, df_G) From 79d7b5cb4f5a8286e424e0c158524d816643f20a Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 16 Jan 2025 08:39:49 -0800 Subject: [PATCH 54/60] add node2vec mg tests --- .../sampling/test_node2vec_random_walks_mg.py | 212 ++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks_mg.py diff --git a/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks_mg.py b/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks_mg.py new file mode 100644 index 0000000000..ad6b2022e7 --- /dev/null +++ b/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks_mg.py @@ -0,0 +1,212 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random + +import pytest + +import cugraph +import dask_cudf +import cugraph.dask as dcg +import cudf +from cugraph.testing import SMALL_DATASETS +from cugraph.datasets import karate_asymmetric +from cugraph.structure.symmetrize import symmetrize +from pylibcugraph.testing.utils import gen_fixture_params_product + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def setup_function(): + gc.collect() + + +IS_DIRECTED = [True, False] + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= + +datasets = SMALL_DATASETS + [karate_asymmetric] + +fixture_params = gen_fixture_params_product( + (datasets, "graph_file"), + (IS_DIRECTED, "directed"), +) + + +def calc_node2vec_random_walks(G): + """ + compute random walks + + parameters + ---------- + G : cuGraph.Graph or networkx.Graph + The graph can be either directed (DiGraph) or undirected (Graph). + Weights in the graph are ignored. + Use weight parameter if weights need to be considered + (currently not supported) + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. 
+ + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + max_path_length : int + The maximum path length + + start_vertices : list + Roots for the random walks + + max_depth : int + """ + k = random.randint(1, 4) + max_depth = random.randint(2, 4) + + start_vertices = G.nodes().compute().sample(k).reset_index(drop=True) + + vertex_paths, edge_weights, max_path_length = dcg.node2vec_random_walks( + G, start_vertices, max_depth + ) + + return (vertex_paths, edge_weights, max_path_length), start_vertices, max_depth + + +def check_node2vec_random_walks(G, path_data, seeds, max_depth, df_G=None): + invalid_edge = 0 + invalid_edge_wgt_path = 0 + invalid_seeds = 0 + next_path_idx = 0 + invalid_edge_wgt_path = 0 + e_wgt_path_idx = 0 + v_paths = path_data[0].compute() + e_paths = path_data[1].compute() + + max_path_length = path_data[2] + sizes = max_path_length + + for _ in range(len(seeds)): + for i in range(next_path_idx, next_path_idx + sizes): + src, dst = v_paths.iloc[i], v_paths.iloc[i + 1] + + if i == next_path_idx and src not in seeds.values: + invalid_seeds += 1 + print("[ERR] Invalid seed: " " src {} != src {}".format(src, seeds)) + + else: + # If everything is good proceed to the next part + # now check the destination + + # find the src out_degree to ensure it effectively has no outgoing edges + # No need to check for -1 values, move to the next iteration + if src != -1: + src_degree = G.out_degree([src])["degree"].compute()[0] + if dst == -1 and src_degree == 0: + if e_paths.values[e_wgt_path_idx] != 0: + wgt = e_paths.values[e_wgt_path_idx] + print( + "[ERR] Invalid edge weight path: " + "Edge src {} dst {} has wgt 0 " + "But got wgt {}".format(src, dst, wgt) + ) + invalid_edge_wgt_path += 1 + else: + exp_edge = df_G.loc[ + (df_G["src"] == (src)) & (df_G["dst"] == (dst)) + ].reset_index(drop=True) + + if len(exp_edge) == 0: + print( + "[ERR] Invalid edge: " + "There is no edge src {} dst 
{}".format(src, dst) + ) + invalid_edge += 1 + else: + # This is a valid edge, check the edge_wgt_path + if e_paths.values[e_wgt_path_idx] != 1: + wgt = e_paths.values[e_wgt_path_idx] + print( + "[ERR] Invalid edge weight path: " + "Edge src {} dst {} has wgt 1 " + "But got wgt {}".format(src, dst, wgt) + ) + invalid_edge_wgt_path += 1 + else: + # v_path: src == -1, dst == -1 => e_wgt_path=0 otherwise ERROR + if e_paths.values[e_wgt_path_idx] != 0: + wgt = e_paths.values[e_wgt_path_idx] + print( + "[ERR] Invalid edge weight path: " + "Edge src {} dst {} has wgt 0 " + "But got wgt {}".format(src, dst, wgt) + ) + invalid_edge_wgt_path += 1 + + e_wgt_path_idx += 1 + next_path_idx += sizes + 1 + + assert invalid_edge == 0 + assert invalid_seeds == 0 + assert invalid_edge_wgt_path == 0 + assert max_path_length == max_depth + + +@pytest.fixture(scope="module", params=fixture_params) +def input_graph(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. 
+ """ + parameters = dict(zip(("graph_file", "directed"), request.param)) + input_data_path = parameters["graph_file"].get_path() + directed = parameters["directed"] + + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value", + renumber=True, + store_transposed=True, + ) + + return dg + + +@pytest.mark.mg +def test_dask_mg_node2vec_random_walks(dask_client, input_graph): + path_data, seeds, max_depth = calc_node2vec_random_walks(input_graph) + df_G = input_graph.input_df.compute().reset_index(drop=True) + + df_G = input_graph.decompress_to_edgelist( + return_unrenumbered_edgelist=True).compute().reset_index(drop=True) + + check_node2vec_random_walks(input_graph, path_data, seeds, max_depth, df_G) From b22d4c8590bfed11bb180a76a0e34ad634c92199 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 16 Jan 2025 08:50:46 -0800 Subject: [PATCH 55/60] add sg node2vec tests --- .../sampling/test_biased_random_walks.py | 2 +- .../sampling/test_node2vec_random_walks.py | 274 ++++++++++++++++++ .../sampling/test_uniform_random_walks.py | 2 +- 3 files changed, 276 insertions(+), 2 deletions(-) create mode 100644 python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks.py diff --git a/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py index b6097aa1ed..cbd4b48329 100644 --- a/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py @@ -219,7 +219,7 @@ def test_biased_random_walks(graph_file, directed): @pytest.mark.sg @pytest.mark.parametrize("graph_file", SMALL_DATASETS) @pytest.mark.parametrize("directed", 
DIRECTED_GRAPH_OPTIONS) -def test_random_walks_multi_column_seeds( +def test_biased_random_walks_multi_column_seeds( graph_file, directed ): diff --git a/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks.py new file mode 100644 index 0000000000..de46ec72eb --- /dev/null +++ b/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks.py @@ -0,0 +1,274 @@ +# Copyright (c) 2025, NVIDIA CORPORATION.: +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import gc +import random + +import pytest +import networkx as nx + +import cudf +import cugraph +from cudf.testing import assert_series_equal +from cudf.testing.testing import assert_frame_equal +from cugraph.utilities import ensure_cugraph_obj_for_nx +from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS, utils + + +# ============================================================================= +# Parameters +# ============================================================================= +DIRECTED_GRAPH_OPTIONS = [False, True] +WEIGHTED_GRAPH_OPTIONS = [False, True] +DATASETS = [pytest.param(d) for d in DEFAULT_DATASETS] +SMALL_DATASETS = [pytest.param(d) for d in SMALL_DATASETS] + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + +def calc_node2vec_random_walks(G, max_depth=None): + """ + compute random walks for each nodes in 'start_vertices' + + parameters + ---------- + G : cuGraph.Graph or networkx.Graph + The graph can be either directed or undirected. + Weights in the graph are ignored. + Use weight parameter if weights need to be considered + (currently not supported) + + max_depth : int + The maximum depth of the random walks + + Returns + ------- + vertex_paths : cudf.Series or cudf.DataFrame + Series containing the vertices of edges/paths in the random walk. + + edge_weight_paths: cudf.Series + Series containing the edge weights of edges represented by the + returned vertex_paths + + sizes: int + The path size in case of coalesced paths. 
+ """ + assert G is not None + + G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="wgt") + + k = random.randint(1, 6) + + random_walks_type = "node2vec" + + start_vertices = G.select_random_vertices(num_vertices=k) + + print("\nstart_vertices is \n", start_vertices) + vertex_paths, edge_weights, vertex_path_sizes = cugraph.node2vec_random_walks( + G, start_vertices, max_depth + ) + + return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices + + + + +def check_node2vec_random_walks(G, path_data, seeds, max_depth): + invalid_edge = 0 + invalid_seeds = 0 + invalid_edge_wgt = 0 + v_paths = path_data[0] + e_wgt_paths = path_data[1] + e_wgt_idx = 0 + + G, _ = ensure_cugraph_obj_for_nx(G, nx_weight_attr="wgt") + df_G = G.input_df + + if "weight" in df_G.columns: + df_G = df_G.rename(columns={"weight": "wgt"}) + + total_depth = (max_depth) * len(seeds) + + for i in range(total_depth): + if isinstance(seeds, cudf.DataFrame): + vertex_1 = v_paths.iloc[[i]].reset_index(drop=True) + vertex_2 = v_paths.iloc[[i + 1]].reset_index(drop=True) + else: + vertex_1, vertex_2 = v_paths.iloc[i], v_paths.iloc[i + 1] + + # Every max_depth'th vertex in 'v_paths' is a seed instead of + # 'seeds[i // (max_depth + 1)]', could have just pop the first element + # of the seeds array once there is a match and compare it to 'vertex_1' + + if i % (max_depth + 1) == 0: + if isinstance(seeds, cudf.DataFrame): + assert_frame_equal( + vertex_1.rename(columns={x:y for x,y in zip(vertex_1.columns,range(0,len(vertex_1.columns)))}), + seeds.iloc[[i // (max_depth + 1)]].reset_index(drop=True).rename(columns={x:y for x,y in zip(seeds.columns,range(0,len(seeds.columns)))}), + check_dtype=False, check_like=True) + else: + if i % (max_depth + 1) == 0 and vertex_1 != seeds[i // (max_depth + 1)]: + invalid_seeds += 1 + print( + "[ERR] Invalid seed: " + " src {} != src {}".format(vertex_1, seeds[i // (max_depth + 1)]) + ) + + if (i % (max_depth + 1)) != (max_depth): + # These are the edges + src = 
vertex_1 + dst = vertex_2 + + # check for valid edge. + if isinstance(seeds, cudf.DataFrame): + if (-1 not in src.iloc[0].reset_index(drop=True)) and (-1 not in dst.iloc[0].reset_index(drop=True)): + edge = cudf.DataFrame() + edge["src"] = vertex_1["0_vertex_paths"] + edge["src_0"] = vertex_1["1_vertex_paths"] + edge["dst"] = vertex_2["0_vertex_paths"] + edge["dst_0"] = vertex_2["1_vertex_paths"] + + join1 = cudf.merge(df_G, edge, on=[*edge.columns]) + + assert len(cudf.merge(df_G, edge, on=[*edge.columns])) > 0 + else: + edge = df_G.loc[ + (df_G["src"] == (src)) & (df_G["dst"] == (dst)) + ].reset_index(drop=True) + + if len(edge) == 0: + print( + "[ERR] Invalid edge: " + "There is no edge src {} dst {}".format(src, dst) + ) + invalid_edge += 1 + + else: + # check valid edge wgt + if G.is_weighted(): + expected_wgt = edge["wgt"].iloc[0] + result_wgt = e_wgt_paths.iloc[e_wgt_idx] + + if expected_wgt != result_wgt: + print( + "[ERR] Invalid edge wgt: " + "The edge src {} dst {} has wgt {} but got {}".format( + src, dst, expected_wgt, result_wgt + ) + ) + invalid_edge_wgt += 1 + e_wgt_idx += 1 + + if src != -1 and dst == -1: + # ensure there is no outgoing edges from 'src' + assert G.out_degree([src])["degree"].iloc[0] == 0 + + assert invalid_seeds == 0 + assert invalid_edge == 0 + assert len(v_paths) == (max_depth + 1) * len(seeds) + if G.is_weighted(): + assert invalid_edge_wgt == 0 + assert len(e_wgt_paths) == (max_depth) * len(seeds) + + + max_path_lenth = path_data[2] + assert max_path_lenth == max_depth + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +@pytest.mark.parametrize("max_depth", [None]) +def test_node2vec_random_walks_invalid_max_dept(graph_file, directed, max_depth): + + input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) + with pytest.raises(ValueError): + _, _, _ = calc_node2vec_random_walks(input_graph, max_depth=max_depth) + + 
+@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_node2vec_random_walks(graph_file, directed): + max_depth = random.randint(2, 10) + print("max_depth is ", max_depth) + input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) + + path_data, seeds = calc_node2vec_random_walks( + input_graph, max_depth=max_depth + ) + + check_node2vec_random_walks(input_graph, path_data, seeds, max_depth) + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +def test_node2vec_random_walks_nx(graph_file): + G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) + + M = G.to_pandas_edgelist() + + source = G.source_columns + target = G.destination_columns + edge_attr = G.weight_column + + Gnx = nx.from_pandas_edgelist( + M, + source=source, + target=target, + edge_attr=edge_attr, + create_using=nx.DiGraph(), + ) + max_depth = random.randint(2, 10) + path_data, seeds = calc_node2vec_random_walks(Gnx, max_depth=max_depth) + + check_node2vec_random_walks(Gnx, path_data, seeds, max_depth) + + +@pytest.mark.sg +@pytest.mark.parametrize("graph_file", SMALL_DATASETS) +@pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) +def test_node2vec_random_walks_multi_column_seeds( + graph_file, + directed +): + max_depth = random.randint(2, 10) + df_G = graph_file.get_edgelist() + df_G.rename( + columns={"wgt": "weight"}, inplace=True) + df_G['src_0'] = df_G['src'] + 1000 + df_G['dst_0'] = df_G['dst'] + 1000 + + if directed: + G = cugraph.Graph(directed=True) + else: + G = cugraph.Graph() + G.from_cudf_edgelist(df_G, source=['src', 'src_0'], + destination=['dst', 'dst_0'], + edge_attr="weight") + + k = random.randint(1, 10) + + seeds = G.select_random_vertices(num_vertices=k) + vertex_paths, edge_weights, vertex_path_sizes = cugraph.node2vec_random_walks( + G, seeds, max_depth) + + path_data = (vertex_paths, edge_weights, vertex_path_sizes) 
+ + check_node2vec_random_walks(G, path_data, seeds, max_depth) + \ No newline at end of file diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py index 7fca738575..ef2fc00af1 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py @@ -243,7 +243,7 @@ def test_uniform_random_walks_nx(graph_file): @pytest.mark.sg @pytest.mark.parametrize("graph_file", SMALL_DATASETS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_random_walks_multi_column_seeds( +def test_uniform_random_walks_multi_column_seeds( graph_file, directed ): From c1f6ff0593afbdc8896e134f05a3a444ee0aec0c Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 16 Jan 2025 09:06:25 -0800 Subject: [PATCH 56/60] deprecate tests --- python/cugraph/cugraph/tests/sampling/test_node2vec.py | 2 ++ python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/python/cugraph/cugraph/tests/sampling/test_node2vec.py b/python/cugraph/cugraph/tests/sampling/test_node2vec.py index 00c3270533..060a3015a7 100644 --- a/python/cugraph/cugraph/tests/sampling/test_node2vec.py +++ b/python/cugraph/cugraph/tests/sampling/test_node2vec.py @@ -48,6 +48,8 @@ def _get_param_args(param_name, param_values): return (param_name, [pytest.param(v, id=f"{param_name}={v}") for v in param_values]) +# FIXME: This test suite must be removed once node2vec is removed from +# the python API in favor of node2vec random walks def calc_node2vec(G, start_vertices, max_depth, compress_result, p=1.0, q=1.0): """ Compute node2vec for each nodes in 'start_vertices' diff --git a/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py b/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py index 96b34c638b..033e6760e0 100644 ---
a/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py @@ -50,6 +50,8 @@ def setup_function(): ) +# FIXME: This test suite must be removed once random_walks is removed from +# the python API in favor of uniform random walks def calc_random_walks(G): """ compute random walks From 75e1253b27c6a50d817c0efb8b0ddc83ace730bb Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 16 Jan 2025 10:37:35 -0800 Subject: [PATCH 57/60] fix style --- .../cugraph/dask/community/__init__.py | 2 +- python/cugraph/cugraph/dask/community/ecg.py | 9 +-- .../cugraph/sampling/node2vec_random_walks.py | 2 +- .../graph_implementation/simpleGraph.py | 2 +- .../cugraph/tests/community/test_ecg_mg.py | 17 ++++- .../sampling/test_biased_random_walks.py | 75 ++++++++++--------- .../sampling/test_biased_random_walks_mg.py | 11 +-- .../cugraph/tests/sampling/test_node2vec.py | 2 +- .../sampling/test_node2vec_random_walks.py | 74 +++++++++--------- .../sampling/test_node2vec_random_walks_mg.py | 11 +-- .../tests/sampling/test_random_walks_mg.py | 2 +- .../sampling/test_uniform_random_walks.py | 74 +++++++++--------- .../sampling/test_uniform_random_walks_mg.py | 11 +-- 13 files changed, 159 insertions(+), 133 deletions(-) diff --git a/python/cugraph/cugraph/dask/community/__init__.py b/python/cugraph/cugraph/dask/community/__init__.py index 146e837bd8..4d848385c4 100644 --- a/python/cugraph/cugraph/dask/community/__init__.py +++ b/python/cugraph/cugraph/dask/community/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/python/cugraph/cugraph/dask/community/ecg.py b/python/cugraph/cugraph/dask/community/ecg.py index 63609a7ca1..6f9f716402 100644 --- a/python/cugraph/cugraph/dask/community/ecg.py +++ b/python/cugraph/cugraph/dask/community/ecg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -26,10 +26,7 @@ from pylibcugraph import ecg as pylibcugraph_ecg import numpy import cupy as cp -from typing import Tuple, TYPE_CHECKING - -if TYPE_CHECKING: - from cugraph import Graph +from typing import Tuple def convert_to_cudf(result: cp.ndarray) -> Tuple[cudf.DataFrame, float]: @@ -75,7 +72,7 @@ def ecg( max_level: int = 10, threshold: float = 1e-7, resolution: float = 1.0, - random_state: int = None + random_state: int = None, ) -> Tuple[dask_cudf.DataFrame, float]: """ Compute the Ensemble Clustering for Graphs (ECG) partition of the input diff --git a/python/cugraph/cugraph/sampling/node2vec_random_walks.py b/python/cugraph/cugraph/sampling/node2vec_random_walks.py index 68e9f3f072..5e7c352534 100644 --- a/python/cugraph/cugraph/sampling/node2vec_random_walks.py +++ b/python/cugraph/cugraph/sampling/node2vec_random_walks.py @@ -161,5 +161,5 @@ def node2vec_random_walks( vertex_paths = df_.fillna(-1) else: vertex_paths = cudf.Series(df_["vertex_paths"]).fillna(-1) - + return vertex_paths, edge_wgt_paths, max_depth diff --git a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py index d9beba70e4..8086c9ddb4 100644 --- a/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py +++ b/python/cugraph/cugraph/structure/graph_implementation/simpleGraph.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. 
+# Copyright (c) 2021-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at diff --git a/python/cugraph/cugraph/tests/community/test_ecg_mg.py b/python/cugraph/cugraph/tests/community/test_ecg_mg.py index 356c812fea..18f4cc4df3 100644 --- a/python/cugraph/cugraph/tests/community/test_ecg_mg.py +++ b/python/cugraph/cugraph/tests/community/test_ecg_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -77,7 +77,15 @@ def golden_call(filename): @pytest.mark.parametrize("resolution", RESOLUTIONS) @pytest.mark.parametrize("random_state", RANDOM_STATES) def test_mg_ecg( - dask_client, dataset, min_weight, ensemble_size, max_level, threshold, resolution, random_state): + dask_client, + dataset, + min_weight, + ensemble_size, + max_level, + threshold, + resolution, + random_state, +): filename = dataset.metadata["name"] dg = get_mg_graph(dataset, directed=False) parts, mod = dcg.ecg( @@ -87,7 +95,8 @@ def test_mg_ecg( max_level=max_level, threshold=threshold, resolution=resolution, - random_state=random_state) + random_state=random_state, + ) filename = dataset.metadata["name"] golden_score = golden_call(filename) @@ -96,7 +105,7 @@ def test_mg_ecg( # assignment assert mod > (0.80 * golden_score) - #print("mod score = ", mod) + # print("mod score = ", mod) # FIXME: either call Nx with the same dataset and compare results, or # hardcode golden results to compare to. 
diff --git a/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py index cbd4b48329..4c12f981b1 100644 --- a/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks.py @@ -15,14 +15,12 @@ import random import pytest -import networkx as nx import cudf import cugraph -from cudf.testing import assert_series_equal from cudf.testing.testing import assert_frame_equal from cugraph.utilities import ensure_cugraph_obj_for_nx -from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS, utils +from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS # ============================================================================= @@ -74,8 +72,6 @@ def calc_biased_random_walks(G, max_depth=None): k = random.randint(1, 6) - random_walks_type = "biased" - start_vertices = G.select_random_vertices(num_vertices=k) print("\nstart_vertices is \n", start_vertices) @@ -86,8 +82,6 @@ def calc_biased_random_walks(G, max_depth=None): return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices - - def check_biased_random_walks(G, path_data, seeds, max_depth): invalid_edge = 0 invalid_seeds = 0 @@ -118,33 +112,51 @@ def check_biased_random_walks(G, path_data, seeds, max_depth): if i % (max_depth + 1) == 0: if isinstance(seeds, cudf.DataFrame): assert_frame_equal( - vertex_1.rename(columns={x:y for x,y in zip(vertex_1.columns,range(0,len(vertex_1.columns)))}), - seeds.iloc[[i // (max_depth + 1)]].reset_index(drop=True).rename(columns={x:y for x,y in zip(seeds.columns,range(0,len(seeds.columns)))}), - check_dtype=False, check_like=True) + vertex_1.rename( + columns={ + x: y + for x, y in zip( + vertex_1.columns, range(0, len(vertex_1.columns)) + ) + } + ), + seeds.iloc[[i // (max_depth + 1)]] + .reset_index(drop=True) + .rename( + columns={ + x: y + for x, y in zip(seeds.columns, range(0, len(seeds.columns))) + } + ), + 
check_dtype=False, + check_like=True, + ) else: if i % (max_depth + 1) == 0 and vertex_1 != seeds[i // (max_depth + 1)]: invalid_seeds += 1 print( "[ERR] Invalid seed: " - " src {} != src {}".format(vertex_1, seeds[i // (max_depth + 1)]) - ) + " src {} != src {}".format( + vertex_1, seeds[i // (max_depth + 1)] + ) + ) if (i % (max_depth + 1)) != (max_depth): # These are the edges src = vertex_1 dst = vertex_2 - + # check for valid edge. if isinstance(seeds, cudf.DataFrame): - if (-1 not in src.iloc[0].reset_index(drop=True)) and (-1 not in dst.iloc[0].reset_index(drop=True)): + if (-1 not in src.iloc[0].reset_index(drop=True)) and ( + -1 not in dst.iloc[0].reset_index(drop=True) + ): edge = cudf.DataFrame() edge["src"] = vertex_1["0_vertex_paths"] edge["src_0"] = vertex_1["1_vertex_paths"] edge["dst"] = vertex_2["0_vertex_paths"] edge["dst_0"] = vertex_2["1_vertex_paths"] - join1 = cudf.merge(df_G, edge, on=[*edge.columns]) - assert len(cudf.merge(df_G, edge, on=[*edge.columns])) > 0 else: edge = df_G.loc[ @@ -185,7 +197,6 @@ def check_biased_random_walks(G, path_data, seeds, max_depth): assert invalid_edge_wgt == 0 assert len(e_wgt_paths) == (max_depth) * len(seeds) - max_path_lenth = path_data[2] assert max_path_lenth == max_depth @@ -209,9 +220,7 @@ def test_biased_random_walks(graph_file, directed): print("max_depth is ", max_depth) input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) - path_data, seeds = calc_biased_random_walks( - input_graph, max_depth=max_depth - ) + path_data, seeds = calc_biased_random_walks(input_graph, max_depth=max_depth) check_biased_random_walks(input_graph, path_data, seeds, max_depth) @@ -219,32 +228,28 @@ def test_biased_random_walks(graph_file, directed): @pytest.mark.sg @pytest.mark.parametrize("graph_file", SMALL_DATASETS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_biased_random_walks_multi_column_seeds( - graph_file, - directed -): +def 
test_biased_random_walks_multi_column_seeds(graph_file, directed): max_depth = random.randint(2, 10) df_G = graph_file.get_edgelist() - df_G.rename( - columns={"wgt": "weight"}, inplace=True) - df_G['src_0'] = df_G['src'] + 1000 - df_G['dst_0'] = df_G['dst'] + 1000 + df_G.rename(columns={"wgt": "weight"}, inplace=True) + df_G["src_0"] = df_G["src"] + 1000 + df_G["dst_0"] = df_G["dst"] + 1000 if directed: G = cugraph.Graph(directed=True) else: G = cugraph.Graph() - G.from_cudf_edgelist(df_G, source=['src', 'src_0'], - destination=['dst', 'dst_0'], - edge_attr="weight") + G.from_cudf_edgelist( + df_G, source=["src", "src_0"], destination=["dst", "dst_0"], edge_attr="weight" + ) k = random.randint(1, 10) seeds = G.select_random_vertices(num_vertices=k) vertex_paths, edge_weights, vertex_path_sizes = cugraph.biased_random_walks( - G, seeds, max_depth) - + G, seeds, max_depth + ) + path_data = (vertex_paths, edge_weights, vertex_path_sizes) - + check_biased_random_walks(G, path_data, seeds, max_depth) - \ No newline at end of file diff --git a/python/cugraph/cugraph/tests/sampling/test_biased_random_walks_mg.py b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks_mg.py index db51e6ca79..5d4c8d445c 100644 --- a/python/cugraph/cugraph/tests/sampling/test_biased_random_walks_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_biased_random_walks_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -19,10 +19,8 @@ import cugraph import dask_cudf import cugraph.dask as dcg -import cudf from cugraph.testing import SMALL_DATASETS from cugraph.datasets import karate_asymmetric -from cugraph.structure.symmetrize import symmetrize from pylibcugraph.testing.utils import gen_fixture_params_product @@ -206,7 +204,10 @@ def test_dask_mg_biased_random_walks(dask_client, input_graph): path_data, seeds, max_depth = calc_biased_random_walks(input_graph) df_G = input_graph.input_df.compute().reset_index(drop=True) - df_G = input_graph.decompress_to_edgelist( - return_unrenumbered_edgelist=True).compute().reset_index(drop=True) + df_G = ( + input_graph.decompress_to_edgelist(return_unrenumbered_edgelist=True) + .compute() + .reset_index(drop=True) + ) check_biased_random_walks(input_graph, path_data, seeds, max_depth, df_G) diff --git a/python/cugraph/cugraph/tests/sampling/test_node2vec.py b/python/cugraph/cugraph/tests/sampling/test_node2vec.py index 060a3015a7..92656d7b7d 100644 --- a/python/cugraph/cugraph/tests/sampling/test_node2vec.py +++ b/python/cugraph/cugraph/tests/sampling/test_node2vec.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks.py index de46ec72eb..cb98087c2f 100644 --- a/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks.py @@ -19,10 +19,9 @@ import cudf import cugraph -from cudf.testing import assert_series_equal from cudf.testing.testing import assert_frame_equal from cugraph.utilities import ensure_cugraph_obj_for_nx -from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS, utils +from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS # ============================================================================= @@ -74,8 +73,6 @@ def calc_node2vec_random_walks(G, max_depth=None): k = random.randint(1, 6) - random_walks_type = "node2vec" - start_vertices = G.select_random_vertices(num_vertices=k) print("\nstart_vertices is \n", start_vertices) @@ -86,8 +83,6 @@ def calc_node2vec_random_walks(G, max_depth=None): return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices - - def check_node2vec_random_walks(G, path_data, seeds, max_depth): invalid_edge = 0 invalid_seeds = 0 @@ -118,33 +113,51 @@ def check_node2vec_random_walks(G, path_data, seeds, max_depth): if i % (max_depth + 1) == 0: if isinstance(seeds, cudf.DataFrame): assert_frame_equal( - vertex_1.rename(columns={x:y for x,y in zip(vertex_1.columns,range(0,len(vertex_1.columns)))}), - seeds.iloc[[i // (max_depth + 1)]].reset_index(drop=True).rename(columns={x:y for x,y in zip(seeds.columns,range(0,len(seeds.columns)))}), - check_dtype=False, check_like=True) + vertex_1.rename( + columns={ + x: y + for x, y in zip( + vertex_1.columns, range(0, len(vertex_1.columns)) + ) + } + ), + seeds.iloc[[i // (max_depth + 1)]] + .reset_index(drop=True) + .rename( + columns={ + x: y + for x, y in zip(seeds.columns, range(0, len(seeds.columns))) + } 
+ ), + check_dtype=False, + check_like=True, + ) else: if i % (max_depth + 1) == 0 and vertex_1 != seeds[i // (max_depth + 1)]: invalid_seeds += 1 print( "[ERR] Invalid seed: " - " src {} != src {}".format(vertex_1, seeds[i // (max_depth + 1)]) - ) + " src {} != src {}".format( + vertex_1, seeds[i // (max_depth + 1)] + ) + ) if (i % (max_depth + 1)) != (max_depth): # These are the edges src = vertex_1 dst = vertex_2 - + # check for valid edge. if isinstance(seeds, cudf.DataFrame): - if (-1 not in src.iloc[0].reset_index(drop=True)) and (-1 not in dst.iloc[0].reset_index(drop=True)): + if (-1 not in src.iloc[0].reset_index(drop=True)) and ( + -1 not in dst.iloc[0].reset_index(drop=True) + ): edge = cudf.DataFrame() edge["src"] = vertex_1["0_vertex_paths"] edge["src_0"] = vertex_1["1_vertex_paths"] edge["dst"] = vertex_2["0_vertex_paths"] edge["dst_0"] = vertex_2["1_vertex_paths"] - join1 = cudf.merge(df_G, edge, on=[*edge.columns]) - assert len(cudf.merge(df_G, edge, on=[*edge.columns])) > 0 else: edge = df_G.loc[ @@ -185,7 +198,6 @@ def check_node2vec_random_walks(G, path_data, seeds, max_depth): assert invalid_edge_wgt == 0 assert len(e_wgt_paths) == (max_depth) * len(seeds) - max_path_lenth = path_data[2] assert max_path_lenth == max_depth @@ -209,9 +221,7 @@ def test_node2vec_random_walks(graph_file, directed): print("max_depth is ", max_depth) input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) - path_data, seeds = calc_node2vec_random_walks( - input_graph, max_depth=max_depth - ) + path_data, seeds = calc_node2vec_random_walks(input_graph, max_depth=max_depth) check_node2vec_random_walks(input_graph, path_data, seeds, max_depth) @@ -243,32 +253,28 @@ def test_node2vec_random_walks_nx(graph_file): @pytest.mark.sg @pytest.mark.parametrize("graph_file", SMALL_DATASETS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_node2vec_random_walks_multi_column_seeds( - graph_file, - directed -): +def 
test_node2vec_random_walks_multi_column_seeds(graph_file, directed): max_depth = random.randint(2, 10) df_G = graph_file.get_edgelist() - df_G.rename( - columns={"wgt": "weight"}, inplace=True) - df_G['src_0'] = df_G['src'] + 1000 - df_G['dst_0'] = df_G['dst'] + 1000 + df_G.rename(columns={"wgt": "weight"}, inplace=True) + df_G["src_0"] = df_G["src"] + 1000 + df_G["dst_0"] = df_G["dst"] + 1000 if directed: G = cugraph.Graph(directed=True) else: G = cugraph.Graph() - G.from_cudf_edgelist(df_G, source=['src', 'src_0'], - destination=['dst', 'dst_0'], - edge_attr="weight") + G.from_cudf_edgelist( + df_G, source=["src", "src_0"], destination=["dst", "dst_0"], edge_attr="weight" + ) k = random.randint(1, 10) seeds = G.select_random_vertices(num_vertices=k) vertex_paths, edge_weights, vertex_path_sizes = cugraph.node2vec_random_walks( - G, seeds, max_depth) - + G, seeds, max_depth + ) + path_data = (vertex_paths, edge_weights, vertex_path_sizes) - + check_node2vec_random_walks(G, path_data, seeds, max_depth) - \ No newline at end of file diff --git a/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks_mg.py b/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks_mg.py index ad6b2022e7..a2c2cdc01d 100644 --- a/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_node2vec_random_walks_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -19,10 +19,8 @@ import cugraph import dask_cudf import cugraph.dask as dcg -import cudf from cugraph.testing import SMALL_DATASETS from cugraph.datasets import karate_asymmetric -from cugraph.structure.symmetrize import symmetrize from pylibcugraph.testing.utils import gen_fixture_params_product @@ -206,7 +204,10 @@ def test_dask_mg_node2vec_random_walks(dask_client, input_graph): path_data, seeds, max_depth = calc_node2vec_random_walks(input_graph) df_G = input_graph.input_df.compute().reset_index(drop=True) - df_G = input_graph.decompress_to_edgelist( - return_unrenumbered_edgelist=True).compute().reset_index(drop=True) + df_G = ( + input_graph.decompress_to_edgelist(return_unrenumbered_edgelist=True) + .compute() + .reset_index(drop=True) + ) check_node2vec_random_walks(input_graph, path_data, seeds, max_depth, df_G) diff --git a/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py b/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py index 033e6760e0..41245d4dfe 100644 --- a/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_random_walks_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py index ef2fc00af1..2b66098f05 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks.py @@ -19,10 +19,9 @@ import cudf import cugraph -from cudf.testing import assert_series_equal from cudf.testing.testing import assert_frame_equal from cugraph.utilities import ensure_cugraph_obj_for_nx -from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS, utils +from cugraph.testing import SMALL_DATASETS, DEFAULT_DATASETS # ============================================================================= @@ -74,8 +73,6 @@ def calc_uniform_random_walks(G, max_depth=None): k = random.randint(1, 6) - random_walks_type = "uniform" - start_vertices = G.select_random_vertices(num_vertices=k) print("\nstart_vertices is \n", start_vertices) @@ -86,8 +83,6 @@ def calc_uniform_random_walks(G, max_depth=None): return (vertex_paths, edge_weights, vertex_path_sizes), start_vertices - - def check_uniform_random_walks(G, path_data, seeds, max_depth): invalid_edge = 0 invalid_seeds = 0 @@ -118,33 +113,51 @@ def check_uniform_random_walks(G, path_data, seeds, max_depth): if i % (max_depth + 1) == 0: if isinstance(seeds, cudf.DataFrame): assert_frame_equal( - vertex_1.rename(columns={x:y for x,y in zip(vertex_1.columns,range(0,len(vertex_1.columns)))}), - seeds.iloc[[i // (max_depth + 1)]].reset_index(drop=True).rename(columns={x:y for x,y in zip(seeds.columns,range(0,len(seeds.columns)))}), - check_dtype=False, check_like=True) + vertex_1.rename( + columns={ + x: y + for x, y in zip( + vertex_1.columns, range(0, len(vertex_1.columns)) + ) + } + ), + seeds.iloc[[i // (max_depth + 1)]] + .reset_index(drop=True) + .rename( + columns={ + x: y + for x, y in zip(seeds.columns, range(0, len(seeds.columns))) + } + ), + 
check_dtype=False, + check_like=True, + ) else: if i % (max_depth + 1) == 0 and vertex_1 != seeds[i // (max_depth + 1)]: invalid_seeds += 1 print( "[ERR] Invalid seed: " - " src {} != src {}".format(vertex_1, seeds[i // (max_depth + 1)]) - ) + " src {} != src {}".format( + vertex_1, seeds[i // (max_depth + 1)] + ) + ) if (i % (max_depth + 1)) != (max_depth): # These are the edges src = vertex_1 dst = vertex_2 - + # check for valid edge. if isinstance(seeds, cudf.DataFrame): - if (-1 not in src.iloc[0].reset_index(drop=True)) and (-1 not in dst.iloc[0].reset_index(drop=True)): + if (-1 not in src.iloc[0].reset_index(drop=True)) and ( + -1 not in dst.iloc[0].reset_index(drop=True) + ): edge = cudf.DataFrame() edge["src"] = vertex_1["0_vertex_paths"] edge["src_0"] = vertex_1["1_vertex_paths"] edge["dst"] = vertex_2["0_vertex_paths"] edge["dst_0"] = vertex_2["1_vertex_paths"] - join1 = cudf.merge(df_G, edge, on=[*edge.columns]) - assert len(cudf.merge(df_G, edge, on=[*edge.columns])) > 0 else: edge = df_G.loc[ @@ -185,7 +198,6 @@ def check_uniform_random_walks(G, path_data, seeds, max_depth): assert invalid_edge_wgt == 0 assert len(e_wgt_paths) == (max_depth) * len(seeds) - max_path_lenth = path_data[2] assert max_path_lenth == max_depth @@ -209,9 +221,7 @@ def test_uniform_random_walks(graph_file, directed): print("max_depth is ", max_depth) input_graph = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) - path_data, seeds = calc_uniform_random_walks( - input_graph, max_depth=max_depth - ) + path_data, seeds = calc_uniform_random_walks(input_graph, max_depth=max_depth) check_uniform_random_walks(input_graph, path_data, seeds, max_depth) @@ -243,32 +253,28 @@ def test_uniform_random_walks_nx(graph_file): @pytest.mark.sg @pytest.mark.parametrize("graph_file", SMALL_DATASETS) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) -def test_uniform_random_walks_multi_column_seeds( - graph_file, - directed -): +def 
test_uniform_random_walks_multi_column_seeds(graph_file, directed): max_depth = random.randint(2, 10) df_G = graph_file.get_edgelist() - df_G.rename( - columns={"wgt": "weight"}, inplace=True) - df_G['src_0'] = df_G['src'] + 1000 - df_G['dst_0'] = df_G['dst'] + 1000 + df_G.rename(columns={"wgt": "weight"}, inplace=True) + df_G["src_0"] = df_G["src"] + 1000 + df_G["dst_0"] = df_G["dst"] + 1000 if directed: G = cugraph.Graph(directed=True) else: G = cugraph.Graph() - G.from_cudf_edgelist(df_G, source=['src', 'src_0'], - destination=['dst', 'dst_0'], - edge_attr="weight") + G.from_cudf_edgelist( + df_G, source=["src", "src_0"], destination=["dst", "dst_0"], edge_attr="weight" + ) k = random.randint(1, 10) seeds = G.select_random_vertices(num_vertices=k) vertex_paths, edge_weights, vertex_path_sizes = cugraph.uniform_random_walks( - G, seeds, max_depth) - + G, seeds, max_depth + ) + path_data = (vertex_paths, edge_weights, vertex_path_sizes) - + check_uniform_random_walks(G, path_data, seeds, max_depth) - \ No newline at end of file diff --git a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks_mg.py b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks_mg.py index 50aeb0ec84..c574927833 100644 --- a/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks_mg.py +++ b/python/cugraph/cugraph/tests/sampling/test_uniform_random_walks_mg.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -19,10 +19,8 @@ import cugraph import dask_cudf import cugraph.dask as dcg -import cudf from cugraph.testing import SMALL_DATASETS from cugraph.datasets import karate_asymmetric -from cugraph.structure.symmetrize import symmetrize from pylibcugraph.testing.utils import gen_fixture_params_product @@ -206,7 +204,10 @@ def test_dask_mg_uniform_random_walks(dask_client, input_graph): path_data, seeds, max_depth = calc_uniform_random_walks(input_graph) df_G = input_graph.input_df.compute().reset_index(drop=True) - df_G = input_graph.decompress_to_edgelist( - return_unrenumbered_edgelist=True).compute().reset_index(drop=True) + df_G = ( + input_graph.decompress_to_edgelist(return_unrenumbered_edgelist=True) + .compute() + .reset_index(drop=True) + ) check_uniform_random_walks(input_graph, path_data, seeds, max_depth, df_G) From f12bded0589d8d415cc3bafe4579c7e7f0beca22 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 16 Jan 2025 10:44:33 -0800 Subject: [PATCH 58/60] update copyright --- python/pylibcugraph/pylibcugraph/node2vec.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcugraph/pylibcugraph/node2vec.pyx b/python/pylibcugraph/pylibcugraph/node2vec.pyx index 5729dc6e05..322a176b24 100644 --- a/python/pylibcugraph/pylibcugraph/node2vec.pyx +++ b/python/pylibcugraph/pylibcugraph/node2vec.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at From b7960b73e9b2f09d437443977d89dd90414e245f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 17 Jan 2025 20:20:07 -0800 Subject: [PATCH 59/60] update pytest ini for random walks --- python/cugraph/pytest.ini | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cugraph/pytest.ini b/python/cugraph/pytest.ini index 3aa4cc5680..e1cdd949e7 100644 --- a/python/cugraph/pytest.ini +++ b/python/cugraph/pytest.ini @@ -71,3 +71,6 @@ filterwarnings = ignore:The behavior of array concatenation with empty entries is deprecated:FutureWarning ignore:This method is deprecated and will no longer be supported. The symmetrization:FutureWarning ignore:Support for accepting and returning NetworkX objects is deprecated. Please use NetworkX with the nx-cugraph backend:DeprecationWarning + ignore:node2vec is deprecated and will be removed in the next release in favor of node2vec_random_walks:FutureWarning + ignore:random_walks is deprecated and will be removed in the next release in favor of uniform_random_walks:FutureWarning + ignore:Coalesced path results, returned when setting legacy_result_type=True, is deprecated and will no longer be supported:FutureWarning \ No newline at end of file From ba015c4968687c2bf160b5b3a6675b282117327c Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 17 Jan 2025 20:22:49 -0800 Subject: [PATCH 60/60] fix style and update copyright --- python/cugraph/pytest.ini | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cugraph/pytest.ini b/python/cugraph/pytest.ini index e1cdd949e7..335a056df4 100644 --- a/python/cugraph/pytest.ini +++ b/python/cugraph/pytest.ini @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at @@ -73,4 +73,4 @@ filterwarnings = ignore:Support for accepting and returning NetworkX objects is deprecated. Please use NetworkX with the nx-cugraph backend:DeprecationWarning ignore:node2vec is deprecated and will be removed in the next release in favor of node2vec_random_walks:FutureWarning ignore:random_walks is deprecated and will be removed in the next release in favor of uniform_random_walks:FutureWarning - ignore:Coalesced path results, returned when setting legacy_result_type=True, is deprecated and will no longer be supported:FutureWarning \ No newline at end of file + ignore:Coalesced path results, returned when setting legacy_result_type=True, is deprecated and will no longer be supported:FutureWarning