From ea0e66d7136aadbdbc292517e20e826286e86df8 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 24 Jun 2024 15:49:13 -0700 Subject: [PATCH 01/33] add plc implementation of all-pairs similarity leveraging the capi --- .../pylibcugraph/pylibcugraph/CMakeLists.txt | 3 + python/pylibcugraph/pylibcugraph/__init__.py | 6 + .../_cugraph_c/similarity_algorithms.pxd | 44 ++++- .../all_pairs_jaccard_coefficients.pyx | 158 ++++++++++++++++++ .../all_pairs_overlap_coefficients.pyx | 158 ++++++++++++++++++ .../all_pairs_sorensen_coefficients.pyx | 158 ++++++++++++++++++ 6 files changed, 526 insertions(+), 1 deletion(-) create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index 7cc90145949..53fbb00f1c1 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -58,6 +58,9 @@ set(cython_sources weakly_connected_components.pyx replicate_edgelist.pyx degrees.pyx + all_pairs_jaccard_coefficients.pyx + all_pairs_sorensen_coefficients.pyx + all_pairs_overlap_coefficients.pyx ) set(linked_libraries cugraph::cugraph;cugraph::cugraph_c) diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index dcdef05e106..99ed3b509e8 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -95,6 +95,12 @@ from pylibcugraph.sorensen_coefficients import sorensen_coefficients +from pylibcugraph.all_pairs_jaccard_coefficients import all_pairs_jaccard_coefficients + +from pylibcugraph.all_pairs_overlap_coefficients import all_pairs_overlap_coefficients + +from pylibcugraph.all_pairs_sorensen_coefficients import 
all_pairs_sorensen_coefficients + from pylibcugraph.degrees import in_degrees, out_degrees, degrees diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd index 406094f18d5..e343b35c069 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd @@ -50,7 +50,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_free( cugraph_similarity_result_t* result ) - + ########################################################################### # jaccard coefficients cdef cugraph_error_code_t \ @@ -63,6 +63,20 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # all-pairs jaccard coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_jaccard_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) ########################################################################### # sorensen coefficients @@ -76,6 +90,20 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # all-pairs sorensen coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_sorensen_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) 
########################################################################### # overlap coefficients @@ -89,3 +117,17 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # all-pairs overlap coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_overlap_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx new file mode 100644 index 00000000000..f4d188ed9e2 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx @@ -0,0 +1,158 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, + cugraph_create_vertex_pairs +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_jaccard_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj +) + + +def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check): + """ + Perform All-Pairs Jaccard similarity computation. + + Note that Jaccard similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. 
+ + use_weight : bool, optional + If set to True, the compute weighted jaccard_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted jaccard_coefficients + + topk : size_t + Specify how many answers to return otherwise will return all values. + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding Jaccard coefficient scores. + + Examples + -------- + # FIXME: No example yet + + """ + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef cugraph_type_erased_device_array_view_t* \ + vertices_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + vertices) + + error_code = cugraph_all_pairs_jaccard_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertices_view_ptr, + use_weight, + topk, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_all_pairs_jaccard_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx new file mode 100644 index 00000000000..12c163a759e --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx @@ -0,0 +1,158 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, + cugraph_create_vertex_pairs +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_overlap_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj +) + + +def all_pairs_overlap_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check): + """ + Perform All-Pairs overlap similarity computation. + + Note that overlap similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. 
+ + use_weight : bool, optional + If set to True, the compute weighted overlap_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted overlap_coefficients + + topk : size_t + Specify how many answers to return otherwise will return all values. + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding overlap coefficient scores. + + Examples + -------- + # FIXME: No example yet + + """ + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef cugraph_type_erased_device_array_view_t* \ + vertices_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + vertices) + + error_code = cugraph_all_pairs_overlap_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertices_view_ptr, + use_weight, + topk, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_all_pairs_overlap_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx new file mode 100644 index 00000000000..be769381be6 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx @@ -0,0 +1,158 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, + cugraph_create_vertex_pairs +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_sorensen_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj +) + + +def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check): + """ + Perform All-Pairs sorensen similarity computation. + + Note that sorensen similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. 
+ + use_weight : bool, optional + If set to True, the compute weighted sorensen_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted sorensen_coefficients + + topk : size_t + Specify how many answers to return otherwise will return all values. + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding sorensen coefficient scores. + + Examples + -------- + # FIXME: No example yet + + """ + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef cugraph_type_erased_device_array_view_t* \ + vertices_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + vertices) + + error_code = cugraph_all_pairs_sorensen_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertices_view_ptr, + use_weight, + topk, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_all_pairs_sorensen_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + + return cupy_first, cupy_second, cupy_similarity From 9250562911891fb783c7d2b0e94a2d775d20a8dd Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 26 Jun 2024 09:22:05 -0700 Subject: [PATCH 02/33] import and use SIZE_MAX --- .../pylibcugraph/all_pairs_jaccard_coefficients.pyx | 8 ++++++-- .../pylibcugraph/all_pairs_overlap_coefficients.pyx | 8 ++++++-- .../pylibcugraph/all_pairs_sorensen_coefficients.pyx | 8 ++++++-- python/pylibcugraph/pylibcugraph/utils.pxd | 3 +++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx index f4d188ed9e2..2c337df4394 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx @@ -55,7 +55,8 @@ from pylibcugraph.graphs cimport ( from pylibcugraph.utils cimport ( assert_success, copy_to_cupy_array, - create_cugraph_type_erased_device_array_view_from_py_obj + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX ) @@ -89,7 +90,7 @@ def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, Otherwise, computed un-weighted 
jaccard_coefficients topk : size_t - Specify how many answers to return otherwise will return all values. + Specify the number of answers to return otherwise will return all values. do_expensive_check : bool @@ -107,6 +108,9 @@ def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, """ + if topk is None: + topk = SIZE_MAX + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx index 12c163a759e..901c4185eb7 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx @@ -55,7 +55,8 @@ from pylibcugraph.graphs cimport ( from pylibcugraph.utils cimport ( assert_success, copy_to_cupy_array, - create_cugraph_type_erased_device_array_view_from_py_obj + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX ) @@ -89,7 +90,7 @@ def all_pairs_overlap_coefficients(ResourceHandle resource_handle, Otherwise, computed un-weighted overlap_coefficients topk : size_t - Specify how many answers to return otherwise will return all values. + Specify the number of answers to return otherwise will return all values. 
do_expensive_check : bool @@ -107,6 +108,9 @@ def all_pairs_overlap_coefficients(ResourceHandle resource_handle, """ + if topk is None: + topk = SIZE_MAX + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx index be769381be6..1a492ab9373 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx @@ -55,7 +55,8 @@ from pylibcugraph.graphs cimport ( from pylibcugraph.utils cimport ( assert_success, copy_to_cupy_array, - create_cugraph_type_erased_device_array_view_from_py_obj + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX ) @@ -89,7 +90,7 @@ def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, Otherwise, computed un-weighted sorensen_coefficients topk : size_t - Specify how many answers to return otherwise will return all values. + Specify the number of answers to return otherwise will return all values. 
do_expensive_check : bool @@ -107,6 +108,9 @@ def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, """ + if topk is None: + topk = SIZE_MAX + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ diff --git a/python/pylibcugraph/pylibcugraph/utils.pxd b/python/pylibcugraph/pylibcugraph/utils.pxd index 7fc140e9aed..d4af1e795ae 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pxd +++ b/python/pylibcugraph/pylibcugraph/utils.pxd @@ -57,3 +57,6 @@ cdef cugraph_type_erased_device_array_view_t* \ cdef create_cupy_array_view_for_device_ptr( cugraph_type_erased_device_array_view_t* device_array_view_ptr, owning_py_object) + +cdef extern from "stdint.h": + size_t SIZE_MAX From f0c9edc7592ad76b6cb32396a2433abfbdb343f0 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 26 Jun 2024 11:00:11 -0700 Subject: [PATCH 03/33] import vertex pairs retrieval function from the result_ptr and update similarity algorithms --- .../_cugraph_c/similarity_algorithms.pxd | 7 +++-- .../all_pairs_jaccard_coefficients.pyx | 24 ++++++++------- .../all_pairs_overlap_coefficients.pyx | 30 ++++++++++--------- .../all_pairs_sorensen_coefficients.pyx | 30 ++++++++++--------- 4 files changed, 50 insertions(+), 41 deletions(-) diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd index e343b35c069..ee16b38e95d 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd @@ -35,11 +35,14 @@ from pylibcugraph._cugraph_c.graph_functions cimport ( cdef extern from "cugraph_c/similarity_algorithms.h": + ########################################################################### - #""" ctypedef struct cugraph_similarity_result_t: pass - #""" + + cdef cugraph_vertex_pairs_t* \ + cugraph_similarity_result_get_vertex_pairs( + 
cugraph_similarity_result_t* result); cdef cugraph_type_erased_device_array_view_t* \ cugraph_similarity_result_get_similarity( diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx index 2c337df4394..70e9846bb75 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx @@ -16,7 +16,6 @@ from libc.stdint cimport uintptr_t from libc.stdio cimport printf -from cython.operator cimport dereference from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, @@ -35,7 +34,6 @@ from pylibcugraph._cugraph_c.graph_functions cimport ( cugraph_vertex_pairs_get_first, cugraph_vertex_pairs_get_second, cugraph_vertex_pairs_free, - cugraph_create_vertex_pairs ) from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, @@ -44,6 +42,7 @@ from pylibcugraph._cugraph_c.similarity_algorithms cimport ( cugraph_all_pairs_jaccard_coefficients, cugraph_similarity_result_t, cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, cugraph_similarity_result_free ) from pylibcugraph.resource_handle cimport ( @@ -64,7 +63,7 @@ def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, _GPUGraph graph, vertices, bool_t use_weight, - size_t topk, + topk, bool_t do_expensive_check): """ Perform All-Pairs Jaccard similarity computation. @@ -85,9 +84,9 @@ def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, on all vertices in the graph. use_weight : bool, optional - If set to True, the compute weighted jaccard_coefficients( + If set to True, then compute weighted jaccard_coefficients( the input graph must be weighted in that case). - Otherwise, computed un-weighted jaccard_coefficients + Otherwise, compute non-weighted jaccard_coefficients topk : size_t Specify the number of answers to return otherwise will return all values. 
@@ -111,8 +110,6 @@ def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, if topk is None: topk = SIZE_MAX - cdef cugraph_vertex_pairs_t* vertex_pairs_ptr - cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ resource_handle.c_resource_handle_ptr cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr @@ -143,20 +140,25 @@ def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) - cdef cugraph_type_erased_device_array_view_t* first_ptr = \ + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ cugraph_vertex_pairs_get_first(vertex_pairs_ptr) - cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr) + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) - cdef cugraph_type_erased_device_array_view_t* second_ptr = \ + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ cugraph_vertex_pairs_get_second(vertex_pairs_ptr) - cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr) + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) # Free all pointers cugraph_similarity_result_free(result_ptr) cugraph_vertex_pairs_free(vertex_pairs_ptr) cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx index 901c4185eb7..95fc99a7dd2 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx @@ -16,7 +16,6 @@ from libc.stdint cimport uintptr_t from libc.stdio 
cimport printf -from cython.operator cimport dereference from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, @@ -35,7 +34,6 @@ from pylibcugraph._cugraph_c.graph_functions cimport ( cugraph_vertex_pairs_get_first, cugraph_vertex_pairs_get_second, cugraph_vertex_pairs_free, - cugraph_create_vertex_pairs ) from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, @@ -44,6 +42,7 @@ from pylibcugraph._cugraph_c.similarity_algorithms cimport ( cugraph_all_pairs_overlap_coefficients, cugraph_similarity_result_t, cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, cugraph_similarity_result_free ) from pylibcugraph.resource_handle cimport ( @@ -64,12 +63,12 @@ def all_pairs_overlap_coefficients(ResourceHandle resource_handle, _GPUGraph graph, vertices, bool_t use_weight, - size_t topk, + topk, bool_t do_expensive_check): """ - Perform All-Pairs overlap similarity computation. + Perform All-Pairs Overlap similarity computation. - Note that overlap similarity must run on a symmetric graph. + Note that Overlap similarity must run on a symmetric graph. Parameters ---------- @@ -85,9 +84,9 @@ def all_pairs_overlap_coefficients(ResourceHandle resource_handle, on all vertices in the graph. use_weight : bool, optional - If set to True, the compute weighted overlap_coefficients( + If set to True, then compute weighted overlap_coefficients( the input graph must be weighted in that case). - Otherwise, computed un-weighted overlap_coefficients + Otherwise, compute non-weighted overlap_coefficients topk : size_t Specify the number of answers to return otherwise will return all values. @@ -100,7 +99,7 @@ def all_pairs_overlap_coefficients(ResourceHandle resource_handle, Returns ------- A tuple of device arrays containing the vertex pairs with - their corresponding overlap coefficient scores. + their corresponding Overlap coefficient scores. 
Examples -------- @@ -111,8 +110,6 @@ def all_pairs_overlap_coefficients(ResourceHandle resource_handle, if topk is None: topk = SIZE_MAX - cdef cugraph_vertex_pairs_t* vertex_pairs_ptr - cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ resource_handle.c_resource_handle_ptr cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr @@ -143,20 +140,25 @@ def all_pairs_overlap_coefficients(ResourceHandle resource_handle, cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) - cdef cugraph_type_erased_device_array_view_t* first_ptr = \ + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ cugraph_vertex_pairs_get_first(vertex_pairs_ptr) - cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr) + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) - cdef cugraph_type_erased_device_array_view_t* second_ptr = \ + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ cugraph_vertex_pairs_get_second(vertex_pairs_ptr) - cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr) + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) # Free all pointers cugraph_similarity_result_free(result_ptr) cugraph_vertex_pairs_free(vertex_pairs_ptr) cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx index 1a492ab9373..c5762271776 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx @@ -16,7 +16,6 @@ from libc.stdint cimport uintptr_t 
from libc.stdio cimport printf -from cython.operator cimport dereference from pylibcugraph._cugraph_c.resource_handle cimport ( bool_t, @@ -35,7 +34,6 @@ from pylibcugraph._cugraph_c.graph_functions cimport ( cugraph_vertex_pairs_get_first, cugraph_vertex_pairs_get_second, cugraph_vertex_pairs_free, - cugraph_create_vertex_pairs ) from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, @@ -44,6 +42,7 @@ from pylibcugraph._cugraph_c.similarity_algorithms cimport ( cugraph_all_pairs_sorensen_coefficients, cugraph_similarity_result_t, cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, cugraph_similarity_result_free ) from pylibcugraph.resource_handle cimport ( @@ -64,12 +63,12 @@ def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, _GPUGraph graph, vertices, bool_t use_weight, - size_t topk, + topk, bool_t do_expensive_check): """ - Perform All-Pairs sorensen similarity computation. + Perform All-Pairs Sorensen similarity computation. - Note that sorensen similarity must run on a symmetric graph. + Note that Sorensen similarity must run on a symmetric graph. Parameters ---------- @@ -85,9 +84,9 @@ def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, on all vertices in the graph. use_weight : bool, optional - If set to True, the compute weighted sorensen_coefficients( + If set to True, then compute weighted sorensen_coefficients( the input graph must be weighted in that case). - Otherwise, computed un-weighted sorensen_coefficients + Otherwise, compute non-weighted sorensen_coefficients topk : size_t Specify the number of answers to return otherwise will return all values. @@ -100,7 +99,7 @@ def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, Returns ------- A tuple of device arrays containing the vertex pairs with - their corresponding sorensen coefficient scores. + their corresponding Sorensen coefficient scores. 
Examples -------- @@ -111,8 +110,6 @@ def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, if topk is None: topk = SIZE_MAX - cdef cugraph_vertex_pairs_t* vertex_pairs_ptr - cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ resource_handle.c_resource_handle_ptr cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr @@ -143,20 +140,25 @@ def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) - cdef cugraph_type_erased_device_array_view_t* first_ptr = \ + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ cugraph_vertex_pairs_get_first(vertex_pairs_ptr) - cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr) + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) - cdef cugraph_type_erased_device_array_view_t* second_ptr = \ + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ cugraph_vertex_pairs_get_second(vertex_pairs_ptr) - cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr) + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) # Free all pointers cugraph_similarity_result_free(result_ptr) cugraph_vertex_pairs_free(vertex_pairs_ptr) cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' return cupy_first, cupy_second, cupy_similarity From 3dfe9fe37dbf48cebf39ff54b28ed91274cf4466 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 26 Jun 2024 11:16:37 -0700 Subject: [PATCH 04/33] add sg implementation of all pairs similarity algorithms --- python/cugraph/cugraph/__init__.py | 3 + .../cugraph/link_prediction/__init__.py | 3 + .../cugraph/link_prediction/jaccard.py | 122 +++++++++++++++++ 
.../cugraph/link_prediction/overlap.py | 125 +++++++++++++++++ .../cugraph/link_prediction/sorensen.py | 127 +++++++++++++++++- 5 files changed, 378 insertions(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index ba7e23df800..d094bef027d 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -76,10 +76,13 @@ from cugraph.link_prediction import ( jaccard, jaccard_coefficient, + all_pairs_jaccard, overlap, overlap_coefficient, + all_pairs_overlap, sorensen, sorensen_coefficient, + all_pairs_sorensen ) from cugraph.traversal import ( diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index 38c8b9a2d3b..14954fc8704 100644 --- a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -13,7 +13,10 @@ from cugraph.link_prediction.jaccard import jaccard from cugraph.link_prediction.jaccard import jaccard_coefficient +from cugraph.link_prediction.jaccard import all_pairs_jaccard from cugraph.link_prediction.sorensen import sorensen from cugraph.link_prediction.sorensen import sorensen_coefficient +from cugraph.link_prediction.sorensen import all_pairs_sorensen from cugraph.link_prediction.overlap import overlap from cugraph.link_prediction.overlap import overlap_coefficient +from cugraph.link_prediction.overlap import all_pairs_overlap diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 06644a7e1b7..87585f76d10 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -22,6 +22,7 @@ from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, + all_pairs_jaccard_coefficients as pylibcugraph_all_pairs_jaccard_coefficients, ) from pylibcugraph import ResourceHandle @@ -238,3 +239,124 @@ def 
jaccard_coefficient( ) return df + +def all_pairs_jaccard( + input_graph: Graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None +): + """ + Compute the All Pairs Jaccard similarity between all pairs of vertices specified. + All pairs Jaccard similarity is defined between two sets as the ratio of the volume + of their intersection divided by the volume of their union. In the context + of graphs, the neighborhood of a vertex is seen as a set. The Jaccard + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_jaccard, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the jaccard coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the jaccard coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. 
+ + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Jaccard weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['jaccard_coeff'] : cudf.Series + The computed Jaccard coefficient between the first and the second + vertex ID. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import all_pairs_jaccard + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = all_pairs_jaccard(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertices is not None: + + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype + ) + + if input_graph.renumbered is True: + if isinstance(vertices, cudf.DataFrame): + vertices = input_graph.lookup_internal_vertex_id( + vertices, vertices.columns + ) + else: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + + + first, second, jaccard_coeff = pylibcugraph_all_pairs_jaccard_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=False, + ) + vertex_pair = cudf.DataFrame() + vertex_pair["first"] = first + vertex_pair["second"] = second + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber( + vertex_pair, "first", preserve_order=True + ) + vertex_pair = 
input_graph.unrenumber( + vertex_pair, "second", preserve_order=True + ) + + + df = vertex_pair + df["jaccard_coeff"] = cudf.Series(jaccard_coeff) + + return df diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index b6e9cfb58c4..48357900e16 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -22,6 +22,7 @@ from pylibcugraph import ( overlap_coefficients as pylibcugraph_overlap_coefficients, + all_pairs_overlap_coefficients as pylibcugraph_all_pairs_overlap_coefficients, ) from pylibcugraph import ResourceHandle @@ -271,3 +272,127 @@ def overlap( df["overlap_coeff"] = cudf.Series(overlap_coeff) return df + + +def all_pairs_overlap( + input_graph: Graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None +): + """ + Compute the All Pairs Overlap Coefficient between each pair of vertices connected + by an edge, or between arbitrary pairs of vertices specified by the user. + Overlap Coefficient is defined between two sets as the ratio of the volume + of their intersection divided by the smaller of their two volumes. In the + context of graphs, the neighborhood of a vertex is seen as a set. The + Overlap Coefficient weight of each edge represents the strength of + connection between vertices based on the relative similarity of their + neighbors. + + cugraph.all_pairs_overlap, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the overlap coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. 
+ + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both directions. The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the overlap coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted overlap (if use_weight==True) + or un-weighted overlap (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Overlap weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['overlap_coeff'] : cudf.Series + The computed Overlap coefficient between the first and the second + vertex ID. 
+ + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import all_pairs_overlap + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = all_pairs_overlap(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertices is not None: + + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype + ) + + if input_graph.renumbered is True: + if isinstance(vertices, cudf.DataFrame): + vertices = input_graph.lookup_internal_vertex_id( + vertices, vertices.columns + ) + else: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + + + first, second, overlap_coeff = pylibcugraph_all_pairs_overlap_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=False, + ) + vertex_pair = cudf.DataFrame() + vertex_pair["first"] = first + vertex_pair["second"] = second + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber( + vertex_pair, "first", preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, "second", preserve_order=True + ) + + + df = vertex_pair + df["overlap_coeff"] = cudf.Series(overlap_coeff) + + return df \ No newline at end of file diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index cac8bfb9cc6..085e11398fe 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -22,6 +22,7 @@ from pylibcugraph import ( sorensen_coefficients as pylibcugraph_sorensen_coefficients, + all_pairs_sorensen_coefficients as pylibcugraph_all_pairs_sorensen_coefficients, ) from pylibcugraph import ResourceHandle @@ -209,8 +210,8 @@ def sorensen_coefficient( 
vertices or iterable of 2-tuples (u, v) where u and v are nodes in the graph. - If provided, the Overlap coefficient is computed for the given vertex - pairs. Otherwise, the current implementation computes the overlap + If provided, the Sorensen coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the sorensen coefficient for all adjacent vertices in the graph. do_expensive_check : bool, optional (default=False) @@ -270,3 +271,125 @@ def sorensen_coefficient( ) return df + + +def all_pairs_sorensen( + input_graph: Graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None +): + """ + Compute All Pairs the Sorensen coefficient between each pair of vertices connected + by an edge, or between arbitrary pairs of vertices specified by the user. + Sorensen coefficient is defined between two sets as the ratio of twice the + volume of their intersection divided by the volume of each set. + If first is specified but second is not, or vice versa, an exception will + be thrown. + + cugraph.all_pairs_sorensen, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the sorensen coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. 
+ + vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the sorensen coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted sorensen (if use_weight==True) + or un-weighted sorensen (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Sorensen weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['sorensen_coeff'] : cudf.Series + The computed Sorensen coefficient between the first and the second + vertex ID. 
+ + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import all_pairs_sorensen + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = all_pairs_sorensen(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertices is not None: + + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype + ) + + if input_graph.renumbered is True: + if isinstance(vertices, cudf.DataFrame): + vertices = input_graph.lookup_internal_vertex_id( + vertices, vertices.columns + ) + else: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + + + first, second, sorensen_coeff = pylibcugraph_all_pairs_sorensen_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=False, + ) + vertex_pair = cudf.DataFrame() + vertex_pair["first"] = first + vertex_pair["second"] = second + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber( + vertex_pair, "first", preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, "second", preserve_order=True + ) + + + df = vertex_pair + df["sorensen_coeff"] = cudf.Series(sorensen_coeff) + + return df From 4a89937d25544a6acf92bae37b8cbbbef9227ee9 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 26 Jun 2024 16:24:45 -0700 Subject: [PATCH 05/33] add mg implementation of all pairs jaccard similarity --- python/cugraph/cugraph/dask/__init__.py | 3 + .../cugraph/dask/link_prediction/jaccard.py | 179 ++++++++++++++++-- .../cugraph/dask/link_prediction/overlap.py | 29 ++- .../cugraph/dask/link_prediction/sorensen.py | 29 ++- 4 files changed, 196 insertions(+), 44 deletions(-) diff --git a/python/cugraph/cugraph/dask/__init__.py 
b/python/cugraph/cugraph/dask/__init__.py index a76f1460575..a7bc2cd968a 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -33,8 +33,11 @@ from .centrality.betweenness_centrality import edge_betweenness_centrality from .cores.k_core import k_core from .link_prediction.jaccard import jaccard +from .link_prediction.jaccard import all_pairs_jaccard from .link_prediction.sorensen import sorensen +#from .link_prediction.sorensen import all_pairs_sorensen from .link_prediction.overlap import overlap +#from .link_prediction.overlap import all_pairs_overlap from .community.leiden import leiden # Avoid "p2p" shuffling in dask for now diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index 3b8edc8daa5..9d10f9aa5a3 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -18,10 +18,17 @@ import dask_cudf import cudf from cugraph.dask.common.input_utils import get_distributed_data +from cugraph.dask import get_n_workers from cugraph.utilities import renumber_vertex_pair +from cugraph.dask.common.part_utils import ( + get_persisted_df_worker_map, + persist_dask_df_equal_parts_per_worker, +) + from pylibcugraph import ( jaccard_coefficients as pylibcugraph_jaccard_coefficients, + all_pairs_jaccard_coefficients as pylibcugraph_all_pairs_jaccard_coefficients, ) from pylibcugraph import ResourceHandle @@ -41,6 +48,22 @@ def convert_to_cudf(cp_arrays): return df +def _call_plc_all_pairs_jaccard( + sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check +): + print("vertices = ", vertices) + print("topk = ", topk) + + return pylibcugraph_all_pairs_jaccard_coefficients( + resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), + graph=mg_graph_x, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=do_expensive_check, + ) + + def 
_call_plc_jaccard( sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name ): @@ -140,21 +163,149 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): do_expensive_check = False - if vertex_pair is not None: - result = [ - client.submit( - _call_plc_jaccard, - Comms.get_session_id(), - input_graph._plc_graph[w], - vertex_pair[w][0], - use_weight, - do_expensive_check, - vertex_pair_col_name, - workers=[w], - allow_other_workers=False, + result = [ + client.submit( + _call_plc_jaccard, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] + + wait(result) + + cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result] + + wait(cudf_result) + + ddf = dask_cudf.from_delayed(cudf_result).persist() + wait(ddf) + + # Wait until the inactive futures are released + wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)]) + + if input_graph.renumbered: + ddf = input_graph.unrenumber(ddf, "first") + ddf = input_graph.unrenumber(ddf, "second") + + return ddf + + +def all_pairs_jaccard( + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None): + """ + Compute the All Pairs Jaccard similarity between all pairs of vertices specified. + All pairs Jaccard similarity is defined between two sets as the ratio of the volume + of their intersection divided by the volume of their union. In the context + of graphs, the neighborhood of a vertex is seen as a set. The Jaccard + similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. 
+ + cugraph.all_pairs_jaccard, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the jaccard coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list (edge weights are not supported yet for this algorithm). The + graph should be undirected where an undirected edge is represented by a + directed edge in both directions. The adjacency list will be computed if + not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the jaccard coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted jaccard (if use_weight==True) + or un-weighted jaccard (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return otherwise returns the entire + solution + + Returns + ------- + result : dask_cudf.DataFrame + GPU distributed data frame containing 3 dask_cudf.Series + + ddf['first']: dask_cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + ddf['second']: dask_cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + ddf['jaccard_coeff']: dask_cudf.Series + The computed jaccard coefficient between the first and the second + vertex ID. 
+ """ + + if input_graph.is_directed(): + raise ValueError("input graph must be undirected") + + # Initialize dask client + client = default_client() + + if vertices is not None: + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, + dtype=input_graph.edgelist.edgelist_df[ + input_graph.renumber_map.renumbered_src_col_name + ].dtype, + ) + + if not isinstance(vertices, (dask_cudf.Series)): + vertices = dask_cudf.from_cudf( + vertices, npartitions=get_n_workers() ) - for w in Comms.get_workers() - ] + + if input_graph.renumbered: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + n_workers = get_n_workers() + vertices = vertices.repartition(npartitions=n_workers) + vertices = persist_dask_df_equal_parts_per_worker(vertices, client) + vertices = get_persisted_df_worker_map(vertices, client) + + do_expensive_check = False + + result = [ + client.submit( + _call_plc_all_pairs_jaccard, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertices[w][0] if vertices is not None else None, + use_weight, + topk, + do_expensive_check, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] wait(result) diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index 4bda05e3c95..c7a2c2a669d 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -143,21 +143,20 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): do_expensive_check = False - if vertex_pair is not None: - result = [ - client.submit( - _call_plc_overlap, - Comms.get_session_id(), - input_graph._plc_graph[w], - vertex_pair[w][0], - use_weight, - do_expensive_check, - vertex_pair_col_name, - workers=[w], - allow_other_workers=False, - ) - for w in Comms.get_workers() - ] + result = [ + client.submit( + _call_plc_overlap, + 
Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] wait(result) diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 163b0d0dc16..80473bd4d65 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -139,21 +139,20 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): do_expensive_check = False - if vertex_pair is not None: - result = [ - client.submit( - _call_plc_sorensen, - Comms.get_session_id(), - input_graph._plc_graph[w], - vertex_pair[w][0], - use_weight, - do_expensive_check, - vertex_pair_col_name, - workers=[w], - allow_other_workers=False, - ) - for w in Comms.get_workers() - ] + result = [ + client.submit( + _call_plc_sorensen, + Comms.get_session_id(), + input_graph._plc_graph[w], + vertex_pair[w][0], + use_weight, + do_expensive_check, + vertex_pair_col_name, + workers=[w], + allow_other_workers=False, + ) + for w in Comms.get_workers() + ] wait(result) From 5249f026fa0b5f820f8c2a528e55ed13094cde1d Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 2 Jul 2024 17:38:16 -0700 Subject: [PATCH 06/33] add mg tests for all pairs jaccard --- cpp/tests/c_api/mg_similarity_test.c | 63 ++++++++++++++++++++++++++-- 1 file changed, 59 insertions(+), 4 deletions(-) diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index 587acb4d295..9c229aa23e9 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -26,7 +26,7 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -typedef enum { JACCARD, SORENSEN, OVERLAP } similarity_t; +typedef enum { JACCARD, SORENSEN, OVERLAP, ALL_PAIRS_JACCARD, ALL_PAIRS_SORENSEN, ALL_PAIRS_OVERLAP } 
similarity_t; int generic_similarity_test(const cugraph_resource_handle_t* handle, vertex_t* h_src, @@ -42,6 +42,7 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, bool_t use_weight, similarity_t test_type) { + printf("\nin all-pairs\n"); int test_ret_value = 0; data_type_id_t vertex_tid = INT32; @@ -92,6 +93,9 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, ret_code = cugraph_jaccard_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case ALL_PAIRS_JACCARD: + ret_code = cugraph_all_pairs_jaccard_coefficients( + handle, graph, NULL, use_weight, SIZE_MAX, FALSE, &result, &ret_error); case SORENSEN: ret_code = cugraph_sorensen_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); @@ -106,15 +110,36 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph similarity failed."); cugraph_type_erased_device_array_view_t* similarity_coefficient; + cugraph_vertex_pairs_t* vertex_pairs_; + printf("\nresult tests = %p\n", result); similarity_coefficient = cugraph_similarity_result_get_similarity(result); + vertex_pairs_ = cugraph_similarity_result_get_vertex_pairs(result); + + cugraph_type_erased_device_array_view_t* first_view; + + first_view = cugraph_vertex_pairs_get_first(vertex_pairs_); + /* + cugraph_type_erased_device_array_view_t* second_view = + cugraph_vertex_pairs_get_first(vertex_pairs_); + */ + + + //raft::print_device_vector("similarity_coefficient", similarity_coefficient.data(), similarity_coefficient.size(), std::cout); + weight_t h_similarity_coefficient[num_pairs]; ret_code = cugraph_type_erased_device_array_view_copy_to_host( handle, (byte_t*)h_similarity_coefficient, similarity_coefficient, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); + printf("\ncoefficient = "); + for (int i = 0; i < num_pairs; i++) { + 
//printf("src = %d, score = %f, ", first_view[i], h_similarity_coefficient[i]); + //printf("src = %d, dst = %d, score = %f, ", first_view[i], second_view[i], h_similarity_coefficient[i]); + } + for (int i = 0; (i < num_pairs) && (test_ret_value == 0); ++i) { TEST_ASSERT(test_ret_value, nearlyEqual(h_similarity_coefficient[i], h_result[i], 0.001), @@ -157,6 +182,35 @@ int test_jaccard(const cugraph_resource_handle_t* handle) JACCARD); } + +int test_all_pairs_jaccard(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 22; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5}; + vertex_t h_second[] = {1, 2, 3, 4, 0, 2, 3, 5, 0, 1, 3, 4, 5, 0, 1, 2, 4, 0, 2, 3, 1, 2}; + weight_t h_result[] = {0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, 0.25, 0.4, 0.2, 0.25, 0.25, 0.666667, 0.166667, 0.2, 0.666667, 0.3333333, 0.25, 0.666667, 0.5, 0.25}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + ALL_PAIRS_JACCARD); +} + int test_weighted_jaccard(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; @@ -311,9 +365,10 @@ int main(int argc, char** argv) cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); int result = 0; - result |= RUN_MG_TEST(test_jaccard, handle); - result |= RUN_MG_TEST(test_sorensen, handle); - result |= RUN_MG_TEST(test_overlap, handle); + result |= RUN_MG_TEST(test_all_pairs_jaccard, handle); + // result |= RUN_MG_TEST(test_jaccard, handle); + // result |= RUN_MG_TEST(test_sorensen, handle); + // result |= RUN_MG_TEST(test_overlap, handle); // result |= 
RUN_MG_TEST(test_weighted_jaccard, handle); // result |= RUN_MG_TEST(test_weighted_sorensen, handle); // result |= RUN_MG_TEST(test_weighted_overlap, handle); From d111e98ed873c0d71ae9cfdb7903552b6bd7fce4 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 3 Jul 2024 19:35:37 -0700 Subject: [PATCH 07/33] unrenumber the vertex_pairs --- cpp/src/c_api/similarity.cpp | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/cpp/src/c_api/similarity.cpp b/cpp/src/c_api/similarity.cpp index aa54fc6dee7..164452b03f3 100644 --- a/cpp/src/c_api/similarity.cpp +++ b/cpp/src/c_api/similarity.cpp @@ -211,6 +211,20 @@ struct all_pairs_similarity_functor : public cugraph::c_api::abstract_functor { vertices_->as_type(), vertices_->size_}) : std::nullopt, topk_ != SIZE_MAX ? std::make_optional(topk_) : std::nullopt); + + cugraph::unrenumber_int_vertices(handle_, + v1.data(), + v1.size(), + number_map->data(), + vertex_partition_range_lasts, + false); + + cugraph::unrenumber_int_vertices(handle_, + v2.data(), + v2.size(), + number_map->data(), + vertex_partition_range_lasts, + false); result_ = new cugraph::c_api::cugraph_similarity_result_t{ new cugraph::c_api::cugraph_type_erased_device_array_t(similarity_coefficients, @@ -458,4 +472,4 @@ extern "C" cugraph_error_code_t cugraph_all_pairs_overlap_coefficients( handle, graph, vertices, overlap_functor{}, use_weight, topk, do_expensive_check); return cugraph::c_api::run_algorithm(graph, functor, result, error); -} +} \ No newline at end of file From 228f3fbff032fc212005fc239bbce3c064dbb35d Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 4 Jul 2024 19:07:36 -0700 Subject: [PATCH 08/33] fix typo --- cpp/src/c_api/similarity.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/c_api/similarity.cpp b/cpp/src/c_api/similarity.cpp index 164452b03f3..4de135081ec 100644 --- a/cpp/src/c_api/similarity.cpp +++ b/cpp/src/c_api/similarity.cpp @@ -216,14 +216,14 @@ struct 
all_pairs_similarity_functor : public cugraph::c_api::abstract_functor { v1.data(), v1.size(), number_map->data(), - vertex_partition_range_lasts, + graph_view.vertex_partition_range_lasts(), false); cugraph::unrenumber_int_vertices(handle_, v2.data(), v2.size(), number_map->data(), - vertex_partition_range_lasts, + graph_view.vertex_partition_range_lasts(), false); result_ = new cugraph::c_api::cugraph_similarity_result_t{ From 2b6c00206120c3469cd6e868c037f0a6e6116207 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Sun, 7 Jul 2024 05:42:52 -0700 Subject: [PATCH 09/33] add tests for the MG CAPI all pairs similarity --- cpp/tests/c_api/mg_similarity_test.c | 471 +++++++++++++++++++++++---- 1 file changed, 410 insertions(+), 61 deletions(-) diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index 9c229aa23e9..1d65a161594 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -34,15 +34,17 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, weight_t* h_wgt, vertex_t* h_first, vertex_t* h_second, + vertex_t* h_start_vertices, weight_t* h_result, size_t num_vertices, size_t num_edges, size_t num_pairs, + size_t num_start_vertices, + size_t topk, bool_t store_transposed, bool_t use_weight, similarity_t test_type) { - printf("\nin all-pairs\n"); int test_ret_value = 0; data_type_id_t vertex_tid = INT32; @@ -54,8 +56,10 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, cugraph_vertex_pairs_t* vertex_pairs = NULL; cugraph_type_erased_device_array_t* v1 = NULL; cugraph_type_erased_device_array_t* v2 = NULL; + cugraph_type_erased_device_array_t* start_v = NULL; cugraph_type_erased_device_array_view_t* v1_view = NULL; cugraph_type_erased_device_array_view_t* v2_view = NULL; + cugraph_type_erased_device_array_view_t* start_v_view = NULL; ret_code = create_test_graph( handle, h_src, h_dst, h_wgt, num_edges, store_transposed, FALSE, TRUE, &graph, &ret_error); @@ 
-63,30 +67,48 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - if (cugraph_resource_handle_get_rank(handle) != 0) { num_pairs = 0; } + if (topk == 0) { topk = SIZE_MAX;} - ret_code = - cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v1, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); + if (cugraph_resource_handle_get_rank(handle) != 0) { num_pairs = 0;} - ret_code = - cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v2, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v2 create failed."); + if (h_first != NULL && h_second != NULL) { - v1_view = cugraph_type_erased_device_array_view(v1); - v2_view = cugraph_type_erased_device_array_view(v2); + ret_code = + cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v1, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, v1_view, (byte_t*)h_first, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_first copy_from_host failed."); + ret_code = + cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v2, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v2 create failed."); - ret_code = cugraph_type_erased_device_array_view_copy_from_host( - handle, v2_view, (byte_t*)h_second, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_second copy_from_host failed."); + v1_view = cugraph_type_erased_device_array_view(v1); + v2_view = cugraph_type_erased_device_array_view(v2); - ret_code = - cugraph_create_vertex_pairs(handle, graph, v1_view, v2_view, &vertex_pairs, &ret_error); - 
TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create vertex pairs failed."); + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, v1_view, (byte_t*)h_first, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_first copy_from_host failed."); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, v2_view, (byte_t*)h_second, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_second copy_from_host failed."); + + ret_code = + cugraph_create_vertex_pairs(handle, graph, v1_view, v2_view, &vertex_pairs, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create vertex pairs failed."); + } + + + if (h_start_vertices != NULL) { + ret_code = + cugraph_type_erased_device_array_create(handle, num_start_vertices, vertex_tid, &start_v, &ret_error); + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); + start_v_view = cugraph_type_erased_device_array_view(start_v); + + ret_code = cugraph_type_erased_device_array_view_copy_from_host( + handle, start_v_view, (byte_t*)h_start_vertices, &ret_error); + + TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_start_vertices copy_from_host failed."); + } switch (test_type) { case JACCARD: @@ -95,38 +117,44 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, break; case ALL_PAIRS_JACCARD: ret_code = cugraph_all_pairs_jaccard_coefficients( - handle, graph, NULL, use_weight, SIZE_MAX, FALSE, &result, &ret_error); + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; case SORENSEN: ret_code = cugraph_sorensen_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case ALL_PAIRS_SORENSEN: + ret_code = cugraph_all_pairs_sorensen_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; case OVERLAP: ret_code = cugraph_overlap_coefficients( handle, graph, 
vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case ALL_PAIRS_OVERLAP: + ret_code = cugraph_all_pairs_overlap_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "cugraph similarity failed."); cugraph_type_erased_device_array_view_t* similarity_coefficient; - cugraph_vertex_pairs_t* vertex_pairs_; - printf("\nresult tests = %p\n", result); similarity_coefficient = cugraph_similarity_result_get_similarity(result); - vertex_pairs_ = cugraph_similarity_result_get_vertex_pairs(result); - - cugraph_type_erased_device_array_view_t* first_view; - - first_view = cugraph_vertex_pairs_get_first(vertex_pairs_); - /* - cugraph_type_erased_device_array_view_t* second_view = - cugraph_vertex_pairs_get_first(vertex_pairs_); - */ - - - //raft::print_device_vector("similarity_coefficient", similarity_coefficient.data(), similarity_coefficient.size(), std::cout); + switch (test_type) { + case ALL_PAIRS_JACCARD: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + case ALL_PAIRS_SORENSEN: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + case ALL_PAIRS_OVERLAP: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; + } weight_t h_similarity_coefficient[num_pairs]; @@ -134,18 +162,13 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, handle, (byte_t*)h_similarity_coefficient, similarity_coefficient, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "copy_to_host failed."); - printf("\ncoefficient = "); - for (int i = 0; i < num_pairs; i++) { - //printf("src = %d, score = %f, ", first_view[i], h_similarity_coefficient[i]); - //printf("src = %d, dst = %d, score = %f, ", first_view[i], second_view[i], 
h_similarity_coefficient[i]); - } - for (int i = 0; (i < num_pairs) && (test_ret_value == 0); ++i) { TEST_ASSERT(test_ret_value, nearlyEqual(h_similarity_coefficient[i], h_result[i], 0.001), "similarity results don't match"); } + if (result != NULL) cugraph_similarity_result_free(result); if (vertex_pairs != NULL) cugraph_vertex_pairs_free(vertex_pairs); cugraph_mg_graph_free(graph); @@ -159,12 +182,15 @@ int test_jaccard(const cugraph_resource_handle_t* handle) size_t num_edges = 16; size_t num_vertices = 6; size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.2, 0.666667, 0.333333, 0.4, 0.166667, 0.5, 0.2, 0.25, 0.25, 0.666667}; return generic_similarity_test(handle, @@ -173,27 +199,68 @@ int test_jaccard(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, FALSE, JACCARD); } +int test_weighted_jaccard(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + 
h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + TRUE, + JACCARD); +} int test_all_pairs_jaccard(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; size_t num_vertices = 6; - size_t num_pairs = 22; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5}; - vertex_t h_second[] = {1, 2, 3, 4, 0, 2, 3, 5, 0, 1, 3, 4, 5, 0, 1, 2, 4, 0, 2, 3, 1, 2}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, 0.25, 0.4, 0.2, 0.25, 0.25, 0.666667, 0.166667, 0.2, 0.666667, 0.3333333, 0.25, 0.666667, 0.5, 0.25}; return generic_similarity_test(handle, @@ -202,29 +269,67 @@ int test_all_pairs_jaccard(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, FALSE, ALL_PAIRS_JACCARD); } -int test_weighted_jaccard(const cugraph_resource_handle_t* handle) +int test_all_pairs_jaccard_with_start_vertices(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; - vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; - weight_t h_wgt[] = { - 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 
4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = {0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, 0.25, 0.4, 0.2, 0.25, 0.25}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.357143, 0.208333, 0.0}; + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_JACCARD); +} + +int test_all_pairs_jaccard_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.666667, 0.666667, 0.666667, 0.666667, 0.5}; return generic_similarity_test(handle, h_src, @@ -232,26 +337,34 @@ int test_weighted_jaccard(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, - TRUE, - JACCARD); + FALSE, + ALL_PAIRS_JACCARD); } + + int test_sorensen(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; size_t num_vertices = 6; size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; vertex_t 
h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.333333, 0.8, 0.5, 0.571429, 0.285714, 0.666667, 0.333333, 0.4, 0.4, 0.8}; return generic_similarity_test(handle, @@ -260,10 +373,13 @@ int test_sorensen(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, FALSE, SORENSEN); @@ -274,6 +390,8 @@ int test_weighted_sorensen(const cugraph_resource_handle_t* handle) size_t num_edges = 16; size_t num_vertices = 7; size_t num_pairs = 3; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; @@ -282,6 +400,7 @@ int test_weighted_sorensen(const cugraph_resource_handle_t* handle) vertex_t h_first[] = {0, 0, 1}; vertex_t h_second[] = {1, 2, 3}; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(handle, @@ -290,26 +409,134 @@ int test_weighted_sorensen(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, TRUE, SORENSEN); } +int test_all_pairs_sorensen(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.333333, 0.4, 0.8, 0.5, 0.333333, 0.571429, 0.285714, 0.666667, 0.4, 0.571429, 0.333333, 
0.4, 0.4, 0.8, 0.285714, 0.333333, 0.8, 0.5, 0.4, 0.8, 0.666667, 0.4}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_SORENSEN); +} + +int test_all_pairs_sorensen_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = {0.333333, 0.4, 0.8, 0.5, 0.333333, 0.571429, 0.285714, 0.666667, 0.4, 0.571429, 0.333333, 0.4, 0.4}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_SORENSEN); +} + +int test_all_pairs_sorensen_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.8, 0.8, 0.8, 0.8, 0.666667}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + 
FALSE, + FALSE, + ALL_PAIRS_SORENSEN); +} + int test_overlap(const cugraph_resource_handle_t* handle) { size_t num_edges = 16; size_t num_vertices = 6; size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.5, 1, 0.5, 0.666667, 0.333333, 1, 0.333333, 0.5, 0.5, 1}; return generic_similarity_test(handle, @@ -318,10 +545,13 @@ int test_overlap(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, FALSE, OVERLAP); @@ -332,6 +562,8 @@ int test_weighted_overlap(const cugraph_resource_handle_t* handle) size_t num_edges = 16; size_t num_vertices = 7; size_t num_pairs = 3; + size_t num_start_vertices = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; @@ -340,6 +572,7 @@ int test_weighted_overlap(const cugraph_resource_handle_t* handle) vertex_t h_first[] = {0, 0, 1}; vertex_t h_second[] = {1, 2, 3}; + vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(handle, @@ -348,15 +581,121 @@ int test_weighted_overlap(const cugraph_resource_handle_t* handle) h_wgt, h_first, h_second, + h_start_vertices, h_result, num_vertices, num_edges, num_pairs, + num_start_vertices, + topk, FALSE, TRUE, OVERLAP); } +int test_all_pairs_overlap(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + 
vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_OVERLAP); +} + +int test_all_pairs_overlap_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_OVERLAP); +} + +int test_all_pairs_overlap_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 
5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_OVERLAP); +} + + /******************************************************************************/ int main(int argc, char** argv) @@ -365,13 +704,23 @@ int main(int argc, char** argv) cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); int result = 0; + result |= RUN_MG_TEST(test_jaccard, handle); + result |= RUN_MG_TEST(test_weighted_jaccard, handle); result |= RUN_MG_TEST(test_all_pairs_jaccard, handle); - // result |= RUN_MG_TEST(test_jaccard, handle); - // result |= RUN_MG_TEST(test_sorensen, handle); - // result |= RUN_MG_TEST(test_overlap, handle); - // result |= RUN_MG_TEST(test_weighted_jaccard, handle); - // result |= RUN_MG_TEST(test_weighted_sorensen, handle); - // result |= RUN_MG_TEST(test_weighted_overlap, handle); + result |= RUN_MG_TEST(test_all_pairs_jaccard_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_jaccard_with_topk, handle); + + result |= RUN_MG_TEST(test_sorensen, handle); + result |= RUN_MG_TEST(test_weighted_sorensen, handle); + result |= RUN_MG_TEST(test_all_pairs_sorensen, handle); + result |= RUN_MG_TEST(test_all_pairs_sorensen_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_sorensen_with_topk, handle); + + result |= RUN_MG_TEST(test_overlap, handle); + result |= RUN_MG_TEST(test_weighted_overlap, handle); + result |= RUN_MG_TEST(test_all_pairs_overlap, handle); + result |= RUN_MG_TEST(test_all_pairs_overlap_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_overlap_with_topk, handle); cugraph_free_resource_handle(handle); 
free_mg_raft_handle(raft_handle); From 42cb71506eaa0542ec2307431065068ca604c6ed Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 01:39:54 -0700 Subject: [PATCH 10/33] add tests for all pairs similarity --- .../tests/link_prediction/test_jaccard.py | 66 ++++++++++++++++++- .../tests/link_prediction/test_overlap.py | 66 ++++++++++++++++++- .../tests/link_prediction/test_sorensen.py | 66 ++++++++++++++++++- 3 files changed, 195 insertions(+), 3 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index 3691ad5a8c9..1f5e811f291 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -22,7 +22,7 @@ import cugraph from cugraph.datasets import netscience from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal +from cudf.testing import assert_series_equal, assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -341,3 +341,67 @@ def test_weighted_jaccard(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.jaccard(G, use_weight=True) + + +@pytest.mark.sg +def test_all_pairs_jaccard(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Jaccard + jaccard_results = cugraph.jaccard(G) + + # Remove self loop + jaccard_results = jaccard_results[jaccard_results['first'] != jaccard_results['second']].reset_index(drop=True) + + all_pairs_jaccard_results = cugraph.all_pairs_jaccard(G) + + assert_frame_equal(jaccard_results.head(), all_pairs_jaccard_results.head(), check_dtype=False, check_like=True) + + +# FIXME +@pytest.mark.sg +@pytest.mark.skip(reason="Inaccurate results returned by all-pairs similarity") +def test_all_pairs_jaccard_with_vertices(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Jaccard + jaccard_results = 
cugraph.jaccard(G) + + # Remove self loop + jaccard_results = jaccard_results[jaccard_results['first'] != jaccard_results['second']].reset_index(drop=True) + + vertices = [0, 1, 2] + + mask_first = jaccard_results['first'].isin(vertices) + mask_second = jaccard_results['second'].isin(vertices) + # mask = [v in vertices for v in (jaccard_results['first'].to_pandas() or jaccard_results['second'].to_pandas())] + mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] + + jaccard_results = jaccard_results[mask].reset_index(drop=True) + + # Call all-pairs Jaccard + all_pairs_jaccard_results = cugraph.all_pairs_jaccard(G, vertices=cudf.Series(vertices, dtype="int32")) + + assert_frame_equal(jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True) + + +@pytest.mark.sg +def test_all_pairs_jaccard_with_topk(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Jaccard + jaccard_results = cugraph.jaccard(G) + + topk = 4 + + # Remove self loop + jaccard_results = jaccard_results[jaccard_results['first'] != jaccard_results['second']].\ + sort_values(["jaccard_coeff", "first", "second"], ascending=False).reset_index(drop=True)[:topk] + + # Call all-pairs Jaccard + all_pairs_jaccard_results = cugraph.all_pairs_jaccard(G, topk=topk).sort_values(["first", "second"], ascending=False).reset_index(drop=True) + + assert_frame_equal(jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 4b00330b6c9..b864f0dbbdf 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -20,7 +20,7 @@ import cudf import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS -from cudf.testing import assert_series_equal +from cudf.testing import 
assert_series_equal, assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -242,3 +242,67 @@ def test_weighted_overlap(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.overlap(G, use_weight=True) + + +@pytest.mark.sg +def test_all_pairs_overlap(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Overlap + overlap_results = cugraph.overlap(G) + + # Remove self loop + overlap_results = overlap_results[overlap_results['first'] != overlap_results['second']].reset_index(drop=True) + + all_pairs_overlap_results = cugraph.all_pairs_overlap(G) + + assert_frame_equal(overlap_results.head(), all_pairs_overlap_results.head(), check_dtype=False, check_like=True) + + +# FIXME +@pytest.mark.sg +@pytest.mark.skip(reason="Inaccurate results returned by all-pairs similarity") +def test_all_pairs_overlap_with_vertices(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Overlap + overlap_results = cugraph.overlap(G) + + # Remove self loop + overlap_results = overlap_results[overlap_results['first'] != overlap_results['second']].reset_index(drop=True) + + vertices = [0, 1, 2] + + mask_first = overlap_results['first'].isin(vertices) + mask_second = overlap_results['second'].isin(vertices) + # mask = [v in vertices for v in (overlap_results['first'].to_pandas() or overlap_results['second'].to_pandas())] + mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] + + overlap_results = overlap_results[mask].reset_index(drop=True) + + # Call all-pairs Overlap + all_pairs_overlap_results = cugraph.all_pairs_overlap(G, vertices=cudf.Series(vertices, dtype="int32")) + + assert_frame_equal(overlap_results, all_pairs_overlap_results, check_dtype=False, check_like=True) + + +@pytest.mark.sg +def test_all_pairs_overlap_with_topk(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Overlap + overlap_results = 
cugraph.overlap(G) + + topk = 4 + + # Remove self loop + overlap_results = overlap_results[overlap_results['first'] != overlap_results['second']].\ + sort_values(["overlap_coeff", "first", "second"], ascending=False).reset_index(drop=True)[:topk] + + # Call all-pairs overlap + all_pairs_overlap_results = cugraph.all_pairs_overlap(G, topk=topk).sort_values(["first", "second"], ascending=False).reset_index(drop=True) + + assert_frame_equal(overlap_results, all_pairs_overlap_results, check_dtype=False, check_like=True) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 6345187a376..0c10fe08b90 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -20,7 +20,7 @@ import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from cugraph.datasets import netscience -from cudf.testing import assert_series_equal +from cudf.testing import assert_series_equal, assert_frame_equal SRC_COL = "0" DST_COL = "1" @@ -337,3 +337,67 @@ def test_weighted_sorensen(): G = karate.get_graph(ignore_weights=True) with pytest.raises(ValueError): cugraph.sorensen(G, use_weight=True) + + +@pytest.mark.sg +def test_all_pairs_sorensen(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Sorensen + sorensen_results = cugraph.sorensen(G) + + # Remove self loop + sorensen_results = sorensen_results[sorensen_results['first'] != sorensen_results['second']].reset_index(drop=True) + + all_pairs_sorensen_results = cugraph.all_pairs_sorensen(G) + + assert_frame_equal(sorensen_results.head(), all_pairs_sorensen_results.head(), check_dtype=False, check_like=True) + + +# FIXME +@pytest.mark.sg +@pytest.mark.skip(reason="Inaccurate results returned by all-pairs similarity") +def test_all_pairs_sorensen_with_vertices(): + karate = UNDIRECTED_DATASETS[0] + G = 
karate.get_graph(ignore_weights=True) + + # Call Sorensen + sorensen_results = cugraph.sorensen(G) + + # Remove self loop + sorensen_results = sorensen_results[sorensen_results['first'] != sorensen_results['second']].reset_index(drop=True) + + vertices = [0, 1, 2] + + mask_first = sorensen_results['first'].isin(vertices) + mask_second = sorensen_results['second'].isin(vertices) + # mask = [v in vertices for v in (sorensen_results['first'].to_pandas() or sorensen_results['second'].to_pandas())] + mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] + + sorensen_results = sorensen_results[mask].reset_index(drop=True) + + # Call all-pairs Sorensen + all_pairs_sorensen_results = cugraph.all_pairs_sorensen(G, vertices=cudf.Series(vertices, dtype="int32")) + + assert_frame_equal(sorensen_results, all_pairs_sorensen_results, check_dtype=False, check_like=True) + + +@pytest.mark.sg +def test_all_pairs_sorensen_with_topk(): + karate = UNDIRECTED_DATASETS[0] + G = karate.get_graph(ignore_weights=True) + + # Call Sorensen + sorensen_results = cugraph.sorensen(G) + + topk = 4 + + # Remove self loop + sorensen_results = sorensen_results[sorensen_results['first'] != sorensen_results['second']].\ + sort_values(["sorensen_coeff", "first", "second"], ascending=False).reset_index(drop=True)[:topk] + + # Call all-pairs sorensen + all_pairs_sorensen_results = cugraph.all_pairs_sorensen(G, topk=topk).sort_values(["first", "second"], ascending=False).reset_index(drop=True) + + assert_frame_equal(sorensen_results, all_pairs_sorensen_results, check_dtype=False, check_like=True) From 7b893e3d43143c9f3035d351db5984dfb9c1a1c0 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 02:37:40 -0700 Subject: [PATCH 11/33] add mg tests for all pairs similarity --- .../tests/link_prediction/test_jaccard_mg.py | 115 +++++++++++++++++- .../tests/link_prediction/test_overlap_mg.py | 112 ++++++++++++++++- .../tests/link_prediction/test_sorensen_mg.py | 72 
++++++++++- 3 files changed, 291 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py index 98f64906564..063a9aa00b0 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py @@ -33,8 +33,10 @@ def setup_function(): IS_DIRECTED = [False] -HAS_VERTEX_PAIR = [True, False] -IS_WEIGHTED = [True, False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] # ============================================================================= @@ -49,6 +51,8 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), (IS_WEIGHTED, "is_weighted"), ) @@ -60,7 +64,7 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + zip(("graph_file", "directed", "has_vertex_pair", "has_vertices", "has_topk", "is_weighted"), request.param) ) return parameters @@ -123,6 +127,70 @@ def input_expected_output(input_combo): return input_combo +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Jaccard algo. + (based on cuGraph Jaccard) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + + if has_topk: + topk = 5 + else: + topk = None + + input_combo["vertices"] = vertices + print("vertices ", vertices, " is_weighted = ", is_weighted) + input_combo["topk"] = topk + sg_cugraph_all_pairs_jaccard = cugraph.all_pairs_jaccard( + G, vertices=input_combo["vertices"], topk=input_combo["topk"], use_weight=is_weighted + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_jaccard + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + # ============================================================================= # Tests # ============================================================================= @@ -164,3 +232,44 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): assert len(jaccard_coeff_diffs1) == 0 assert len(jaccard_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_jaccard(dask_client, benchmark, input_expected_output_all_pairs): + + dg = input_expected_output_all_pairs["MGGraph"] + + + use_weight = input_expected_output_all_pairs["is_weighted"] + + + result_jaccard = benchmark( + dcg.all_pairs_jaccard, dg, vertices=input_expected_output_all_pairs["vertices"], topk=input_expected_output_all_pairs["topk"], use_weight=use_weight + ) + + result_jaccard = ( + result_jaccard.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"jaccard_coeff": "mg_cugraph_jaccard_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Jaccard results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_jaccard["sg_cugraph_jaccard_coeff"] = expected_output["jaccard_coeff"] + + jaccard_coeff_diffs1 = result_jaccard.query( + "mg_cugraph_jaccard_coeff - sg_cugraph_jaccard_coeff > 0.00001" + ) + jaccard_coeff_diffs2 = result_jaccard.query( + "mg_cugraph_jaccard_coeff - sg_cugraph_jaccard_coeff < -0.00001" + ) + + assert len(jaccard_coeff_diffs1) == 0 + assert len(jaccard_coeff_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index 9afe7dd842f..77aabea868b 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -33,8 +33,10 @@ def setup_function(): IS_DIRECTED = [False] -HAS_VERTEX_PAIR = [True, False] -IS_WEIGHTED = [True, False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] # ============================================================================= @@ -49,6 +51,8 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), (IS_WEIGHTED, "is_weighted"), ) @@ -123,6 +127,69 @@ def input_expected_output(input_combo): return input_combo +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Overlap algo. + (based on cuGraph Overlap) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + + if has_topk: + topk = 5 + else: + topk = None + + input_combo["vertices"] = vertices + input_combo["topk"] = topk + sg_cugraph_all_pairs_overlap = cugraph.all_pairs_overlap( + G, vertices=input_combo["vertices"], topk=input_combo["topk"], use_weight=is_weighted + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_overlap + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + # ============================================================================= # Tests # ============================================================================= @@ -167,3 +234,44 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): assert len(overlap_coeff_diffs1) == 0 assert len(overlap_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_overlap(dask_client, benchmark, input_expected_output_all_pairs): + + dg = input_expected_output_all_pairs["MGGraph"] + + + use_weight = input_expected_output_all_pairs["is_weighted"] + + + result_overlap = benchmark( + dcg.all_pairs_overlap, dg, vertices=input_expected_output_all_pairs["vertices"], topk=input_expected_output_all_pairs["topk"], use_weight=use_weight + ) + + result_overlap = ( + result_overlap.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"overlap_coeff": "mg_cugraph_overlap_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Overlap results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_overlap["sg_cugraph_overlap_coeff"] = expected_output["overlap_coeff"] + + overlap_coeff_diffs1 = result_overlap.query( + "mg_cugraph_overlap_coeff - sg_cugraph_overlap_coeff > 0.00001" + ) + overlap_coeff_diffs2 = result_overlap.query( + "mg_cugraph_overlap_coeff - sg_cugraph_overlap_coeff < -0.00001" + ) + + assert len(overlap_coeff_diffs1) == 0 + assert len(overlap_coeff_diffs2) == 0 diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py index 6c24fa5af13..ac39ed1cbc6 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py @@ -34,8 +34,10 @@ def setup_function(): IS_DIRECTED = [False] -HAS_VERTEX_PAIR = [True, False] -IS_WEIGHTED = [True, False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] # ============================================================================= @@ -61,7 +63,7 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + zip(("graph_file", "directed", "has_vertex_pair", "has_vertices", "has_topk", "is_weighted"), request.param) ) return parameters @@ -124,6 +126,70 @@ def input_expected_output(input_combo): return input_combo +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the Sorensen algo. + (based on cuGraph Sorensen) which can be used for validation. 
+ """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + if has_vertices: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + + if has_topk: + topk = 5 + else: + topk = None + + input_combo["vertices"] = vertices + print("vertices ", vertices, " is_weighted = ", is_weighted) + input_combo["topk"] = topk + sg_cugraph_all_pairs_sorensen = cugraph.all_pairs_sorensen( + G, vertices=input_combo["vertices"], topk=input_combo["topk"], use_weight=is_weighted + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_sorensen + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + # ============================================================================= # Tests # ============================================================================= From 3fcee12867040f59094d0937ebbf9f4d66e1e5f4 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 02:49:36 -0700 Subject: [PATCH 12/33] add CAPI for cosine similarity --- cpp/src/c_api/similarity.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/cpp/src/c_api/similarity.cpp b/cpp/src/c_api/similarity.cpp index 4de135081ec..9b5a6a8c885 100644 --- a/cpp/src/c_api/similarity.cpp +++ b/cpp/src/c_api/similarity.cpp @@ -288,6 +288,32 @@ struct sorensen_functor { } }; +struct cosine_functor { + template + rmm::device_uvector operator()( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs) + { + return cugraph::cosine_similarity_coefficients(handle, graph_view, edge_weight_view, vertex_pairs); + } + + template + std::tuple, + rmm::device_uvector, + rmm::device_uvector> + operator()(raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk) + { + return cugraph::cosine_similarity_all_pairs_coefficients( + handle, graph_view, edge_weight_view, vertices, topk); + } +}; + struct overlap_functor { template rmm::device_uvector 
operator()( From 18ba600ee85dce4ee91459095af1560f85b1834d Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 03:01:15 -0700 Subject: [PATCH 13/33] add plc API for cosine similarity --- python/pylibcugraph/pylibcugraph/__init__.py | 4 + .../_cugraph_c/similarity_algorithms.pxd | 27 +++ .../all_pairs_cosine_coefficients.pyx | 164 +++++++++++++++++ .../pylibcugraph/cosine_coefficients.pyx | 171 ++++++++++++++++++ 4 files changed, 366 insertions(+) create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx create mode 100644 python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index 99ed3b509e8..b67acc8bbfc 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -95,12 +95,16 @@ from pylibcugraph.sorensen_coefficients import sorensen_coefficients +from pylibcugraph.cosine_coefficients import cosine_coefficients + from pylibcugraph.all_pairs_jaccard_coefficients import all_pairs_jaccard_coefficients from pylibcugraph.all_pairs_overlap_coefficients import all_pairs_overlap_coefficients from pylibcugraph.all_pairs_sorensen_coefficients import all_pairs_sorensen_coefficients +from pylibcugraph.all_pairs_cosine_coefficients import all_pairs_cosine_coefficients + from pylibcugraph.degrees import in_degrees, out_degrees, degrees diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd index ee16b38e95d..061b7138c84 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd @@ -134,3 +134,30 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) + + 
########################################################################### + # cosine coefficients + cdef cugraph_error_code_t \ + cugraph_cosine_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) + + ########################################################################### + # all-pairs cosine coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_cosine_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx new file mode 100644 index 00000000000..2ced2cc127b --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx @@ -0,0 +1,164 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_all_pairs_cosine_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_get_vertex_pairs, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj, + SIZE_MAX +) + + +def all_pairs_cosine_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + vertices, + bool_t use_weight, + topk, + bool_t do_expensive_check): + """ + Perform All-Pairs Cosine similarity computation. + + Note that Cosine similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + vertices : cudf.Series or None + Vertex list to compute all-pairs. If None, then compute based + on all vertices in the graph. 
+ + use_weight : bool, optional + If set to True, then compute weighted cosine_coefficients( + the input graph must be weighted in that case). + Otherwise, compute non-weighted cosine_coefficients + + topk : size_t + Specify the number of answers to return otherwise will return all values. + + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding Cosine coefficient scores. + + Examples + -------- + # FIXME: No example yet + + """ + + if topk is None: + topk = SIZE_MAX + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + cdef cugraph_type_erased_device_array_view_t* \ + vertices_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + vertices) + + error_code = cugraph_all_pairs_cosine_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertices_view_ptr, + use_weight, + topk, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_all_pairs_cosine_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ + cugraph_similarity_result_get_vertex_pairs(result_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_view_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_view_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_view_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(vertices_view_ptr) + # No need to free 'first_view_ptr' and 'second_view_ptr' as their memory + # are already deallocated when freeing 'result_ptr' + + return cupy_first, cupy_second, cupy_similarity diff --git a/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx b/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx new file mode 100644 index 00000000000..b5392e1e7e6 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx @@ -0,0 +1,171 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Have cython use python 3 syntax +# cython: language_level = 3 + +from libc.stdint cimport uintptr_t +from libc.stdio cimport printf +from cython.operator cimport dereference + +from pylibcugraph._cugraph_c.resource_handle cimport ( + bool_t, + cugraph_resource_handle_t, +) +from pylibcugraph._cugraph_c.error cimport ( + cugraph_error_code_t, + cugraph_error_t, +) +from pylibcugraph._cugraph_c.array cimport ( + cugraph_type_erased_device_array_view_t, + cugraph_type_erased_device_array_view_free +) +from pylibcugraph._cugraph_c.graph_functions cimport ( + cugraph_vertex_pairs_t, + cugraph_vertex_pairs_get_first, + cugraph_vertex_pairs_get_second, + cugraph_vertex_pairs_free, + cugraph_create_vertex_pairs +) +from pylibcugraph._cugraph_c.graph cimport ( + cugraph_graph_t, +) +from pylibcugraph._cugraph_c.similarity_algorithms cimport ( + cugraph_cosine_coefficients, + cugraph_similarity_result_t, + cugraph_similarity_result_get_similarity, + cugraph_similarity_result_free +) +from pylibcugraph.resource_handle cimport ( + ResourceHandle, +) +from pylibcugraph.graphs cimport ( + _GPUGraph, +) +from pylibcugraph.utils cimport ( + assert_success, + copy_to_cupy_array, + create_cugraph_type_erased_device_array_view_from_py_obj +) + + +def cosine_coefficients(ResourceHandle resource_handle, + _GPUGraph graph, + first, + second, + bool_t use_weight, + bool_t do_expensive_check): + """ + Compute the Cosine coefficients for the specified vertex_pairs. + + Note that Cosine similarity must run on a symmetric graph. + + Parameters + ---------- + resource_handle : ResourceHandle + Handle to the underlying device resources needed for referencing data + and running algorithms. + + graph : SGGraph or MGGraph + The input graph, for either Single or Multi-GPU operations. + + first : + Source of the vertex pair. + + second : + Destination of the vertex pair. 
+ + use_weight : bool, optional + If set to True, the compute weighted cosine_coefficients( + the input graph must be weighted in that case). + Otherwise, computed un-weighted cosine_coefficients + + do_expensive_check : bool + If True, performs more extensive tests on the inputs to ensure + validitity, at the expense of increased run time. + + Returns + ------- + A tuple of device arrays containing the vertex pairs with + their corresponding Cosine coefficient scores. + + Examples + -------- + # FIXME: No example yet + + """ + + cdef cugraph_vertex_pairs_t* vertex_pairs_ptr + + cdef cugraph_resource_handle_t* c_resource_handle_ptr = \ + resource_handle.c_resource_handle_ptr + cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr + + cdef cugraph_similarity_result_t* result_ptr + cdef cugraph_error_code_t error_code + cdef cugraph_error_t* error_ptr + + # 'first' is a required parameter + cdef cugraph_type_erased_device_array_view_t* \ + first_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + first) + + # 'second' is a required parameter + cdef cugraph_type_erased_device_array_view_t* \ + second_view_ptr = \ + create_cugraph_type_erased_device_array_view_from_py_obj( + second) + + error_code = cugraph_create_vertex_pairs(c_resource_handle_ptr, + c_graph_ptr, + first_view_ptr, + second_view_ptr, + &vertex_pairs_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "vertex_pairs") + + error_code = cugraph_cosine_coefficients(c_resource_handle_ptr, + c_graph_ptr, + vertex_pairs_ptr, + use_weight, + do_expensive_check, + &result_ptr, + &error_ptr) + assert_success(error_code, error_ptr, "cugraph_cosine_coefficients") + + # Extract individual device array pointers from result and copy to cupy + # arrays for returning. 
+ cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \ + cugraph_similarity_result_get_similarity(result_ptr) + + cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr) + + cdef cugraph_type_erased_device_array_view_t* first_ptr = \ + cugraph_vertex_pairs_get_first(vertex_pairs_ptr) + + cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr) + + cdef cugraph_type_erased_device_array_view_t* second_ptr = \ + cugraph_vertex_pairs_get_second(vertex_pairs_ptr) + + cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr) + + # Free all pointers + cugraph_similarity_result_free(result_ptr) + cugraph_vertex_pairs_free(vertex_pairs_ptr) + + cugraph_type_erased_device_array_view_free(first_view_ptr) + cugraph_type_erased_device_array_view_free(second_view_ptr) + + return cupy_first, cupy_second, cupy_similarity From 2b32679e803027f233c852c6c2ec77ec1c0a647b Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 03:55:00 -0700 Subject: [PATCH 14/33] add all pairs sorensen, overlap and consine similarity to the python API --- python/cugraph/cugraph/dask/__init__.py | 6 +- .../cugraph/dask/link_prediction/cosine.py | 326 ++++++++++++++++++ .../cugraph/dask/link_prediction/jaccard.py | 2 - .../cugraph/dask/link_prediction/overlap.py | 151 +++++++- .../cugraph/dask/link_prediction/sorensen.py | 151 +++++++- 5 files changed, 630 insertions(+), 6 deletions(-) create mode 100644 python/cugraph/cugraph/dask/link_prediction/cosine.py diff --git a/python/cugraph/cugraph/dask/__init__.py b/python/cugraph/cugraph/dask/__init__.py index a7bc2cd968a..6d86982142b 100644 --- a/python/cugraph/cugraph/dask/__init__.py +++ b/python/cugraph/cugraph/dask/__init__.py @@ -35,9 +35,11 @@ from .link_prediction.jaccard import jaccard from .link_prediction.jaccard import all_pairs_jaccard from .link_prediction.sorensen import sorensen -#from .link_prediction.sorensen import all_pairs_sorensen +from .link_prediction.sorensen import 
all_pairs_sorensen from .link_prediction.overlap import overlap -#from .link_prediction.overlap import all_pairs_overlap +from .link_prediction.overlap import all_pairs_overlap +from .link_prediction.cosine import cosine +from .link_prediction.cosine import all_pairs_cosine from .community.leiden import leiden # Avoid "p2p" shuffling in dask for now diff --git a/python/cugraph/cugraph/dask/link_prediction/cosine.py b/python/cugraph/cugraph/dask/link_prediction/cosine.py new file mode 100644 index 00000000000..4bd341d00fc --- /dev/null +++ b/python/cugraph/cugraph/dask/link_prediction/cosine.py @@ -0,0 +1,326 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
from dask.distributed import wait, default_client
import cugraph.dask.comms.comms as Comms
import dask_cudf
import cudf
from cugraph.dask.common.input_utils import get_distributed_data
from cugraph.dask import get_n_workers
from cugraph.utilities import renumber_vertex_pair
from cugraph.dask.common.part_utils import (
    get_persisted_df_worker_map,
    persist_dask_df_equal_parts_per_worker,
)


from pylibcugraph import (
    cosine_coefficients as pylibcugraph_cosine_coefficients,
    all_pairs_cosine_coefficients as pylibcugraph_all_pairs_cosine_coefficients,
)
from pylibcugraph import ResourceHandle


def convert_to_cudf(cp_arrays):
    """
    Creates a cudf DataFrame from cupy arrays from pylibcugraph wrapper

    Parameters
    ----------
    cp_arrays : tuple
        The ``(first, second, similarity)`` arrays, in that order.

    Returns
    -------
    df : cudf.DataFrame
        Data frame with the columns 'first', 'second' and 'cosine_coeff'.
    """

    cupy_first, cupy_second, cupy_similarity = cp_arrays

    df = cudf.DataFrame()
    df["first"] = cupy_first
    df["second"] = cupy_second
    df["cosine_coeff"] = cupy_similarity

    return df


def _call_plc_all_pairs_cosine(
    sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check
):
    """
    Per-worker task: call the pylibcugraph all-pairs cosine wrapper using the
    comms handle registered for session ``sID``.

    ``vertices`` may be None, in which case None is forwarded as is.
    """
    # NOTE: no debug printing here; this function is executed on every worker.
    return pylibcugraph_all_pairs_cosine_coefficients(
        resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
        graph=mg_graph_x,
        vertices=vertices,
        use_weight=use_weight,
        topk=topk,
        do_expensive_check=do_expensive_check,
    )


def _call_plc_cosine(
    sID, mg_graph_x, vertex_pair, use_weight, do_expensive_check, vertex_pair_col_name
):
    """
    Per-worker task: call the pylibcugraph cosine wrapper on this worker's
    partition of the vertex pairs.

    The first two entries of ``vertex_pair_col_name`` select the 'first' and
    'second' columns of ``vertex_pair``.
    """

    first = vertex_pair[vertex_pair_col_name[0]]
    second = vertex_pair[vertex_pair_col_name[1]]

    return pylibcugraph_cosine_coefficients(
        resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
        graph=mg_graph_x,
        first=first,
        second=second,
        use_weight=use_weight,
        do_expensive_check=do_expensive_check,
    )


def cosine(input_graph, vertex_pair=None, use_weight=False):
    """
    Compute the Cosine similarity between arbitrary pairs of vertices
    specified by the user, or between the two-hop neighbor pairs of the graph.

    Cosine similarity is defined between two sets as the volume of their
    intersection divided by the square root of the product of their volumes.
    In the context of graphs, the neighborhood of a vertex is seen as a set.

    cugraph.dask.cosine, in the absence of a specified vertex pair list, will
    compute the two_hop_neighbors of the entire graph to construct a vertex pair
    list and will return the cosine coefficient for those vertex pairs. This is
    not advisable as the vertex_pairs can grow exponentially with respect to the
    size of the datasets.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph Graph instance, should contain the connectivity information
        as an edge list. The graph must be undirected.

    vertex_pair : cudf.DataFrame or dask_cudf.DataFrame, optional (default=None)
        A GPU dataframe consisting of two columns representing pairs of
        vertices. If provided, the cosine coefficient is computed for the
        given vertex pairs. If not provided, the pairs returned by
        ``input_graph.get_two_hop_neighbors()`` are used.

    use_weight : bool, optional (default=False)
        Flag to indicate whether to compute weighted cosine (if use_weight==True)
        or un-weighted cosine (if use_weight==False).
        'input_graph' must be weighted if 'use_weight=True'.

    Returns
    -------
    result : dask_cudf.DataFrame
        GPU distributed data frame containing 3 dask_cudf.Series

        ddf['first']: dask_cudf.Series
            The first vertex ID of each pair.
        ddf['second']: dask_cudf.Series
            The second vertex ID of each pair.
        ddf['cosine_coeff']: dask_cudf.Series
            The computed cosine coefficient between the first and the second
            vertex ID.

    Raises
    ------
    ValueError
        If 'input_graph' is directed, or if 'vertex_pair' is neither a cudf
        nor a dask_cudf DataFrame.
    """

    if input_graph.is_directed():
        raise ValueError("input graph must be undirected")

    if vertex_pair is None:
        # Call two_hop neighbor of the entire graph
        vertex_pair = input_graph.get_two_hop_neighbors()

    # Validate the type before touching any DataFrame attribute so that a bad
    # argument yields the documented ValueError rather than an AttributeError.
    if not isinstance(vertex_pair, (dask_cudf.DataFrame, cudf.DataFrame)):
        raise ValueError("vertex_pair must be a dask_cudf or cudf dataframe")

    # Column names are captured before renumbering, as the workers use them to
    # pick the 'first' and 'second' columns.
    vertex_pair_col_name = vertex_pair.columns

    vertex_pair = renumber_vertex_pair(input_graph, vertex_pair)

    if not isinstance(vertex_pair, (dask_cudf.DataFrame)):
        vertex_pair = dask_cudf.from_cudf(
            vertex_pair, npartitions=len(Comms.get_workers())
        )
    vertex_pair = get_distributed_data(vertex_pair)
    wait(vertex_pair)
    vertex_pair = vertex_pair.worker_to_parts

    # Initialize dask client
    client = default_client()

    do_expensive_check = False

    # One task per worker, pinned to the worker holding the graph partition.
    result = [
        client.submit(
            _call_plc_cosine,
            Comms.get_session_id(),
            input_graph._plc_graph[w],
            vertex_pair[w][0],
            use_weight,
            do_expensive_check,
            vertex_pair_col_name,
            workers=[w],
            allow_other_workers=False,
        )
        for w in Comms.get_workers()
    ]

    wait(result)

    cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result]

    wait(cudf_result)

    ddf = dask_cudf.from_delayed(cudf_result).persist()
    wait(ddf)

    # Wait until the inactive futures are released
    wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)])

    if input_graph.renumbered:
        ddf = input_graph.unrenumber(ddf, "first")
        ddf = input_graph.unrenumber(ddf, "second")

    return ddf


def all_pairs_cosine(
    input_graph,
    vertices: cudf.Series = None,
    use_weight: bool = False,
    topk: int = None,
):
    """
    Compute the All Pairs Cosine similarity between all pairs of vertices specified.

    Cosine similarity is defined between two sets as the volume of their
    intersection divided by the square root of the product of their volumes.
    In the context of graphs, the neighborhood of a vertex is seen as a set.

    If the topk parameter is specified then the result will only contain the top k
    highest scoring results.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph Graph instance, should contain the connectivity information
        as an edge list. The graph must be undirected.

    vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None)
        The input vertex list. An int is treated as a one-element list and a
        list is converted to a cudf.Series with the dtype of the graph's
        source column. If None, ``vertices=None`` is forwarded to pylibcugraph
        on every worker.

    use_weight : bool, optional (default=False)
        Flag to indicate whether to compute weighted cosine (if use_weight==True)
        or un-weighted cosine (if use_weight==False).
        'input_graph' must be weighted if 'use_weight=True'.

    topk : int, optional (default=None)
        Specify the number of answers to return otherwise returns the entire
        solution

    Returns
    -------
    result : dask_cudf.DataFrame
        GPU distributed data frame containing 3 dask_cudf.Series

        ddf['first']: dask_cudf.Series
            The first vertex ID of each pair.
        ddf['second']: dask_cudf.Series
            The second vertex ID of each pair.
        ddf['cosine_coeff']: dask_cudf.Series
            The computed cosine coefficient between the first and the second
            vertex ID.

    Raises
    ------
    ValueError
        If 'input_graph' is directed.
    """

    if input_graph.is_directed():
        raise ValueError("input graph must be undirected")

    # Initialize dask client
    client = default_client()

    if vertices is not None:
        if isinstance(vertices, int):
            vertices = [vertices]

        if isinstance(vertices, list):
            vertices = cudf.Series(
                vertices,
                dtype=input_graph.edgelist.edgelist_df[
                    input_graph.renumber_map.renumbered_src_col_name
                ].dtype,
            )

        if not isinstance(vertices, (dask_cudf.Series)):
            vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers())

        if input_graph.renumbered:
            vertices = input_graph.lookup_internal_vertex_id(vertices)

        # Spread the vertex list so that each worker owns a partition, then
        # build the worker -> partitions map indexed below.
        n_workers = get_n_workers()
        vertices = vertices.repartition(npartitions=n_workers)
        vertices = persist_dask_df_equal_parts_per_worker(vertices, client)
        vertices = get_persisted_df_worker_map(vertices, client)

    do_expensive_check = False

    # One task per worker, pinned to the worker holding the graph partition.
    result = [
        client.submit(
            _call_plc_all_pairs_cosine,
            Comms.get_session_id(),
            input_graph._plc_graph[w],
            vertices[w][0] if vertices is not None else None,
            use_weight,
            topk,
            do_expensive_check,
            workers=[w],
            allow_other_workers=False,
        )
        for w in Comms.get_workers()
    ]

    wait(result)

    cudf_result = [client.submit(convert_to_cudf, cp_arrays) for cp_arrays in result]

    wait(cudf_result)

    ddf = dask_cudf.from_delayed(cudf_result).persist()
    wait(ddf)

    # Wait until the inactive futures are released
    wait([(r.release(), c_r.release()) for r, c_r in zip(result, cudf_result)])

    if input_graph.renumbered:
        ddf = input_graph.unrenumber(ddf, "first")
        ddf = input_graph.unrenumber(ddf, "second")

    return ddf
def _call_plc_all_pairs_overlap(
    sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check
):
    """
    Per-worker task: run the pylibcugraph all-pairs overlap wrapper with the
    comms handle registered for session ``sID``.
    """
    handle = ResourceHandle(Comms.get_handle(sID).getHandle())

    return pylibcugraph_all_pairs_overlap_coefficients(
        resource_handle=handle,
        graph=mg_graph_x,
        vertices=vertices,
        use_weight=use_weight,
        topk=topk,
        do_expensive_check=do_expensive_check,
    )


def all_pairs_overlap(
    input_graph,
    vertices: cudf.Series = None,
    use_weight: bool = False,
    topk: int = None,
):
    """
    Compute the All Pairs Overlap similarity between all pairs of vertices
    specified.

    The Overlap coefficient of two sets is the volume of their intersection
    divided by the smaller of their two volumes. In the context of graphs,
    the neighborhood of a vertex is seen as a set.

    If the topk parameter is specified then the result will only contain the
    top k highest scoring results.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph Graph instance holding the connectivity information as an
        edge list. It must be undirected.

    vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None)
        The input vertex list. An int is wrapped in a list, and a list is
        turned into a cudf.Series typed like the graph's source column. When
        None, None is handed to pylibcugraph on every worker.

    use_weight : bool, optional (default=False)
        Compute the weighted overlap when True, the un-weighted one otherwise.
        'input_graph' must be weighted if 'use_weight=True'.

    topk : int, optional (default=None)
        Number of answers to return; the entire solution is returned when
        not specified.

    Returns
    -------
    result : dask_cudf.DataFrame
        GPU distributed data frame with the following columns

        ddf['first']: dask_cudf.Series
            The first vertex ID of each pair.
        ddf['second']: dask_cudf.Series
            The second vertex ID of each pair.
        ddf['overlap_coeff']: dask_cudf.Series
            The computed overlap coefficient between the first and the second
            vertex ID.

    Raises
    ------
    ValueError
        If 'input_graph' is directed.
    """

    if input_graph.is_directed():
        raise ValueError("input graph must be undirected")

    # Initialize dask client
    client = default_client()

    if vertices is not None:
        if isinstance(vertices, int):
            vertices = [vertices]

        if isinstance(vertices, list):
            edgelist_df = input_graph.edgelist.edgelist_df
            src_col = input_graph.renumber_map.renumbered_src_col_name
            vertices = cudf.Series(vertices, dtype=edgelist_df[src_col].dtype)

        if not isinstance(vertices, dask_cudf.Series):
            vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers())

        if input_graph.renumbered:
            vertices = input_graph.lookup_internal_vertex_id(vertices)

        # One partition per worker, then map each worker to its partitions.
        vertices = vertices.repartition(npartitions=get_n_workers())
        vertices = persist_dask_df_equal_parts_per_worker(vertices, client)
        vertices = get_persisted_df_worker_map(vertices, client)

    do_expensive_check = False

    # Submit one task per worker, pinned to that worker.
    plc_futures = []
    for worker in Comms.get_workers():
        session_id = Comms.get_session_id()
        worker_graph = input_graph._plc_graph[worker]
        worker_vertices = None if vertices is None else vertices[worker][0]
        plc_futures.append(
            client.submit(
                _call_plc_all_pairs_overlap,
                session_id,
                worker_graph,
                worker_vertices,
                use_weight,
                topk,
                do_expensive_check,
                workers=[worker],
                allow_other_workers=False,
            )
        )

    wait(plc_futures)

    df_futures = []
    for cp_arrays in plc_futures:
        df_futures.append(client.submit(convert_to_cudf, cp_arrays))

    wait(df_futures)

    ddf = dask_cudf.from_delayed(df_futures).persist()
    wait(ddf)

    # Wait until the inactive futures are released
    released = []
    for plc_future, df_future in zip(plc_futures, df_futures):
        released.append((plc_future.release(), df_future.release()))
    wait(released)

    if input_graph.renumbered:
        for column in ("first", "second"):
            ddf = input_graph.unrenumber(ddf, column)

    return ddf
def _call_plc_all_pairs_sorensen(
    sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check
):
    """
    Per-worker task: run the pylibcugraph all-pairs sorensen wrapper with the
    comms handle registered for session ``sID``.
    """
    plc_kwargs = dict(
        resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()),
        graph=mg_graph_x,
        vertices=vertices,
        use_weight=use_weight,
        topk=topk,
        do_expensive_check=do_expensive_check,
    )

    return pylibcugraph_all_pairs_sorensen_coefficients(**plc_kwargs)


def all_pairs_sorensen(
    input_graph,
    vertices: cudf.Series = None,
    use_weight: bool = False,
    topk: int = None,
):
    """
    Compute the All Pairs Sorensen similarity between all pairs of vertices
    specified.

    The Sorensen coefficient of two sets is twice the volume of their
    intersection divided by the sum of their volumes. In the context of
    graphs, the neighborhood of a vertex is seen as a set.

    If the topk parameter is specified then the result will only contain the
    top k highest scoring results.

    Parameters
    ----------
    input_graph : cugraph.Graph
        cuGraph Graph instance holding the connectivity information as an
        edge list. It must be undirected.

    vertices : int or list or cudf.Series, dask_cudf.Series, optional (default=None)
        The input vertex list. An int is wrapped in a list, and a list is
        turned into a cudf.Series typed like the graph's source column. When
        None, None is handed to pylibcugraph on every worker.

    use_weight : bool, optional (default=False)
        Compute the weighted sorensen when True, the un-weighted one otherwise.
        'input_graph' must be weighted if 'use_weight=True'.

    topk : int, optional (default=None)
        Number of answers to return; the entire solution is returned when
        not specified.

    Returns
    -------
    result : dask_cudf.DataFrame
        GPU distributed data frame with the following columns

        ddf['first']: dask_cudf.Series
            The first vertex ID of each pair.
        ddf['second']: dask_cudf.Series
            The second vertex ID of each pair.
        ddf['sorensen_coeff']: dask_cudf.Series
            The computed sorensen coefficient between the first and the second
            vertex ID.

    Raises
    ------
    ValueError
        If 'input_graph' is directed.
    """

    if input_graph.is_directed():
        raise ValueError("input graph must be undirected")

    # Initialize dask client
    client = default_client()

    have_vertices = vertices is not None

    if have_vertices:
        if isinstance(vertices, int):
            vertices = [vertices]

        if isinstance(vertices, list):
            edgelist_df = input_graph.edgelist.edgelist_df
            src_col_name = input_graph.renumber_map.renumbered_src_col_name
            vertex_dtype = edgelist_df[src_col_name].dtype
            vertices = cudf.Series(vertices, dtype=vertex_dtype)

        if not isinstance(vertices, dask_cudf.Series):
            vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers())

        if input_graph.renumbered:
            vertices = input_graph.lookup_internal_vertex_id(vertices)

        # One partition per worker, then map each worker to its partitions.
        worker_count = get_n_workers()
        vertices = vertices.repartition(npartitions=worker_count)
        vertices = persist_dask_df_equal_parts_per_worker(vertices, client)
        vertices = get_persisted_df_worker_map(vertices, client)

    do_expensive_check = False

    def _submit(worker):
        # Pin the task to the worker that owns this graph partition.
        return client.submit(
            _call_plc_all_pairs_sorensen,
            Comms.get_session_id(),
            input_graph._plc_graph[worker],
            vertices[worker][0] if have_vertices else None,
            use_weight,
            topk,
            do_expensive_check,
            workers=[worker],
            allow_other_workers=False,
        )

    plc_futures = [_submit(worker) for worker in Comms.get_workers()]

    wait(plc_futures)

    frame_futures = [
        client.submit(convert_to_cudf, cp_arrays) for cp_arrays in plc_futures
    ]

    wait(frame_futures)

    ddf = dask_cudf.from_delayed(frame_futures).persist()
    wait(ddf)

    # Wait until the inactive futures are released
    wait(
        [
            (plc_future.release(), frame_future.release())
            for plc_future, frame_future in zip(plc_futures, frame_futures)
        ]
    )

    if not input_graph.renumbered:
        return ddf

    ddf = input_graph.unrenumber(ddf, "first")
    ddf = input_graph.unrenumber(ddf, "second")

    return ddf
03:56:35 -0700 Subject: [PATCH 15/33] add all pairs cosine similarity --- .../cugraph/link_prediction/__init__.py | 3 + .../cugraph/cugraph/link_prediction/cosine.py | 354 ++++++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 python/cugraph/cugraph/link_prediction/cosine.py diff --git a/python/cugraph/cugraph/link_prediction/__init__.py b/python/cugraph/cugraph/link_prediction/__init__.py index 14954fc8704..f511b95c34c 100644 --- a/python/cugraph/cugraph/link_prediction/__init__.py +++ b/python/cugraph/cugraph/link_prediction/__init__.py @@ -20,3 +20,6 @@ from cugraph.link_prediction.overlap import overlap from cugraph.link_prediction.overlap import overlap_coefficient from cugraph.link_prediction.overlap import all_pairs_overlap +from cugraph.link_prediction.cosine import cosine +from cugraph.link_prediction.cosine import cosine_coefficient +from cugraph.link_prediction.cosine import all_pairs_cosine diff --git a/python/cugraph/cugraph/link_prediction/cosine.py b/python/cugraph/cugraph/link_prediction/cosine.py new file mode 100644 index 00000000000..297fa15d336 --- /dev/null +++ b/python/cugraph/cugraph/link_prediction/cosine.py @@ -0,0 +1,354 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from cugraph.utilities import ( + ensure_cugraph_obj_for_nx, + df_edge_score_to_dictionary, + renumber_vertex_pair, +) +import cudf +import warnings +from typing import Union, Iterable + +from pylibcugraph import ( + cosine_coefficients as pylibcugraph_cosine_coefficients, + all_pairs_cosine_coefficients as pylibcugraph_all_pairs_cosine_coefficients, +) +from pylibcugraph import ResourceHandle + +from cugraph.structure import Graph +from cugraph.utilities.utils import import_optional + +# FIXME: the networkx.Graph type used in type annotations is specified +# using a string literal to avoid depending on and importing networkx. +# Instead, networkx is imported optionally, which may cause a problem +# for a type checker if run in an environment where networkx is not installed. +networkx = import_optional("networkx") + + +# FIXME: Move this function to the utility module so that it can be +# shared by other algos +def ensure_valid_dtype(input_graph, vertex_pair): + vertex_dtype = input_graph.edgelist.edgelist_df.dtypes.iloc[0] + vertex_pair_dtypes = vertex_pair.dtypes + + if ( + vertex_pair_dtypes.iloc[0] != vertex_dtype + or vertex_pair_dtypes.iloc[1] != vertex_dtype + ): + warning_msg = ( + "Cosine requires 'vertex_pair' to match the graph's 'vertex' type. " + f"input graph's vertex type is: {vertex_dtype} and got " + f"'vertex_pair' of type: {vertex_pair_dtypes}." + ) + warnings.warn(warning_msg, UserWarning) + vertex_pair = vertex_pair.astype(vertex_dtype) + + return vertex_pair + + +def cosine( + input_graph: Graph, + vertex_pair: cudf.DataFrame = None, + use_weight: bool = False, +): + """ + Compute the Cosine similarity between each pair of vertices connected by + an edge, or between arbitrary pairs of vertices specified by the user. + + cugraph.cosine, in the absence of a specified vertex pair list, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for those vertex pairs. 
This is + not advisable as the vertex_pairs can grow exponentially with respect to the + size of the datasets. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both direction.The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertex_pair : cudf.DataFrame, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices. If provided, the cosine coefficient is computed for the + given vertex pairs. If the vertex_pair is not provided then the + current implementation computes the cosine coefficient for all + adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Cosine weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['cosine_coeff'] : cudf.Series + The computed Cosine coefficient between the first and the second + vertex ID. 
+ + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import cosine + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = cosine(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertex_pair is None: + # Call two_hop neighbor of the entire graph + vertex_pair = input_graph.get_two_hop_neighbors() + + v_p_num_col = len(vertex_pair.columns) + + if isinstance(vertex_pair, cudf.DataFrame): + vertex_pair = renumber_vertex_pair(input_graph, vertex_pair) + vertex_pair = ensure_valid_dtype(input_graph, vertex_pair) + src_col_name = vertex_pair.columns[0] + dst_col_name = vertex_pair.columns[1] + first = vertex_pair[src_col_name] + second = vertex_pair[dst_col_name] + + elif vertex_pair is not None: + raise ValueError("vertex_pair must be a cudf Dataframe") + + first, second, cosine_coeff = pylibcugraph_cosine_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + first=first, + second=second, + use_weight=use_weight, + do_expensive_check=False, + ) + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber( + vertex_pair, src_col_name, preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, dst_col_name, preserve_order=True + ) + + if v_p_num_col == 2: + # single column vertex + vertex_pair = vertex_pair.rename( + columns={src_col_name: "first", dst_col_name: "second"} + ) + + df = vertex_pair + df["cosine_coeff"] = cudf.Series(cosine_coeff) + + return df + + +def cosine_coefficient( + G: Union[Graph, "networkx.Graph"], + ebunch: Union[cudf.DataFrame, Iterable[Union[int, str, float]]] = None, +): + """ + Note: No NetworkX equivalent. + + Parameters + ---------- + G : cugraph.Graph or NetworkX.Graph + cuGraph or NetworkX Graph instance, should contain the connectivity + information as an edge list. 
The graph should be undirected where an + undirected edge is represented by a directed edge in both direction. + The adjacency list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + ebunch : cudf.DataFrame or iterable of node pairs, optional (default=None) + A GPU dataframe consisting of two columns representing pairs of + vertices or iterable of 2-tuples (u, v) where u and v are nodes in + the graph. + + If provided, the Overlap coefficient is computed for the given vertex + pairs. Otherwise, the current implementation computes the overlap + coefficient for all adjacent vertices in the graph. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Cosine weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified). + df['second'] : cudf.Series + the second vertex ID of each pair (will be identical to second if + specified). + df['cosine_coeff'] : cudf.Series + The computed Cosine coefficient between the first and the second + vertex ID. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import cosine_coefficient + >>> G = karate.get_graph(download=True) + >>> df = cosine_coefficient(G) + + """ + vertex_pair = None + + G, isNx = ensure_cugraph_obj_for_nx(G) + + if isNx is True and ebunch is not None: + vertex_pair = cudf.DataFrame(ebunch) + + df = cosine(G, vertex_pair) + + if isNx is True: + df = df_edge_score_to_dictionary( + df, k="cosine_coeff", src="first", dst="second" + ) + + return df + +def all_pairs_cosine( + input_graph: Graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None +): + """ + Compute the All Pairs Cosine similarity between all pairs of vertices specified. 
+ The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. + + cugraph.all_pairs_cosine, in the absence of specified vertices, will + compute the two_hop_neighbors of the entire graph to construct a vertex pair + list and will return the cosine coefficient for all the vertex pairs in the graph. + This is not advisable as the vertex_pairs can grow exponentially with respect to + the size of the datasets. + + If the topk parameter is specified then the result will only contain the top k + highest scoring results. + + Parameters + ---------- + input_graph : cugraph.Graph + cuGraph Graph instance, should contain the connectivity information + as an edge list. The graph should be undirected where an undirected + edge is represented by a directed edge in both directions. The adjacency + list will be computed if not already present. + + This implementation only supports undirected, non-multi Graphs. + + vertices : int or list or cudf.Series or cudf.DataFrame, optional (default=None) + A GPU Series containing the input vertex list. If the vertex list is not + provided then the current implementation computes the cosine coefficient for + all adjacent vertices in the graph. + + use_weight : bool, optional (default=False) + Flag to indicate whether to compute weighted cosine (if use_weight==True) + or un-weighted cosine (if use_weight==False). + 'input_graph' must be weighted if 'use_weight=True'. + + topk : int, optional (default=None) + Specify the number of answers to return; otherwise returns the entire + solution. + + Returns + ------- + df : cudf.DataFrame + GPU data frame of size E (the default) or the size of the given pairs + (first, second) containing the Cosine weights. The ordering is + relative to the adjacency list, or that given by the specified vertex + pairs. + + df['first'] : cudf.Series + The first vertex ID of each pair (will be identical to first if specified).
+ df['second'] : cudf.Series + The second vertex ID of each pair (will be identical to second if + specified). + df['cosine_coeff'] : cudf.Series + The computed Cosine coefficient between the first and the second + vertex ID. + + Examples + -------- + >>> from cugraph.datasets import karate + >>> from cugraph import all_pairs_cosine + >>> input_graph = karate.get_graph(download=True, ignore_weights=True) + >>> df = all_pairs_cosine(input_graph) + + """ + if input_graph.is_directed(): + raise ValueError("Input must be an undirected Graph.") + + if vertices is not None: + + if isinstance(vertices, int): + vertices = [vertices] + + if isinstance(vertices, list): + vertices = cudf.Series( + vertices, dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype + ) + + if input_graph.renumbered is True: + if isinstance(vertices, cudf.DataFrame): + vertices = input_graph.lookup_internal_vertex_id( + vertices, vertices.columns + ) + else: + vertices = input_graph.lookup_internal_vertex_id(vertices) + + + + first, second, cosine_coeff = pylibcugraph_all_pairs_cosine_coefficients( + resource_handle=ResourceHandle(), + graph=input_graph._plc_graph, + vertices=vertices, + use_weight=use_weight, + topk=topk, + do_expensive_check=False, + ) + vertex_pair = cudf.DataFrame() + vertex_pair["first"] = first + vertex_pair["second"] = second + + if input_graph.renumbered: + vertex_pair = input_graph.unrenumber( + vertex_pair, "first", preserve_order=True + ) + vertex_pair = input_graph.unrenumber( + vertex_pair, "second", preserve_order=True + ) + + + df = vertex_pair + df["cosine_coeff"] = cudf.Series(cosine_coeff) + + return df From 9b0253f5d10bf96ff8642c1dbd9ef0aa8a7b2ab4 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 04:00:21 -0700 Subject: [PATCH 16/33] add MG tests for cosine similarity --- .../tests/link_prediction/test_cosine_mg.py | 275 ++++++++++++++++++ 1 file changed, 275 insertions(+) create mode 100644 
python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py diff --git a/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py new file mode 100644 index 00000000000..88d292dec76 --- /dev/null +++ b/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py @@ -0,0 +1,275 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import gc +import random + +import pytest + +import dask_cudf +import cugraph +import cugraph.dask as dcg +from cugraph.testing import utils +from pylibcugraph.testing import gen_fixture_params_product + + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= + + +def setup_function(): + gc.collect() + + +IS_DIRECTED = [False] +HAS_VERTEX_PAIR = [False, True] +HAS_VERTICES = [False, True] +HAS_TOPK = [False, True] +IS_WEIGHTED = [False, True] + + +# ============================================================================= +# Pytest fixtures +# ============================================================================= + +datasets = utils.DATASETS_UNDIRECTED + [ + utils.RAPIDS_DATASET_ROOT_DIR_PATH / "email-Eu-core.csv" +] + +fixture_params = gen_fixture_params_product( + (datasets, "graph_file"), + (IS_DIRECTED, "directed"), + (HAS_VERTEX_PAIR, 
"has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), + (IS_WEIGHTED, "is_weighted"), +) + + +@pytest.fixture(scope="module", params=fixture_params) +def input_combo(request): + """ + Simply return the current combination of params as a dictionary for use in + tests or other parameterized fixtures. + """ + parameters = dict( + zip(("graph_file", "directed", "has_vertex_pair", "has_vertices", "has_topk", "is_weighted"), request.param) + ) + + return parameters + + +@pytest.fixture(scope="module") +def input_expected_output(input_combo): + """ + This fixture returns the inputs and expected results from the Cosine algo. + (based on cuGraph Cosine) which can be used for validation. + """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertex_pair = input_combo["has_vertex_pair"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + if has_vertex_pair: + # Sample random vertices from the graph and compute the two_hop_neighbors + # with those seeds + k = random.randint(1, 10) + seeds = random.sample(range(G.number_of_vertices()), k) + + vertex_pair = G.get_two_hop_neighbors(start_vertices=seeds) + else: + vertex_pair = None + + input_combo["vertex_pair"] = vertex_pair + sg_cugraph_cosine = cugraph.cosine( + G, input_combo["vertex_pair"], use_weight=is_weighted + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call. 
+ + input_combo["sg_cugraph_results"] = sg_cugraph_cosine + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + +@pytest.fixture(scope="module") +def input_expected_output_all_pairs(input_combo): + """ + This fixture returns the inputs and expected results from the all-pairs Cosine algo. + (based on cuGraph all_pairs_cosine) which can be used for validation. + """ + + input_data_path = input_combo["graph_file"] + directed = input_combo["directed"] + has_vertices = input_combo["has_vertices"] + has_topk = input_combo["has_topk"] + is_weighted = input_combo["is_weighted"] + G = utils.generate_cugraph_graph_from_file( + input_data_path, directed=directed, edgevals=is_weighted + ) + if has_vertices: + # Sample random vertices from the graph to use as the input vertex list + # for the all-pairs computation + k = random.randint(1, 10) + vertices = random.sample(range(G.number_of_vertices()), k) + + else: + vertices = None + + if has_topk: + topk = 5 + else: + topk = None + + input_combo["vertices"] = vertices + print("vertices ", vertices, " is_weighted = ", is_weighted) + input_combo["topk"] = topk + sg_cugraph_all_pairs_cosine = cugraph.all_pairs_cosine( + G, vertices=input_combo["vertices"], topk=input_combo["topk"], use_weight=is_weighted + ) + # Save the results back to the input_combo dictionary to prevent redundant + # cuGraph runs. Other tests using the input_combo fixture will look for + # them, and if not present they will have to re-run the same cuGraph call.
+ + input_combo["sg_cugraph_results"] = sg_cugraph_all_pairs_cosine + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + blocksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) + + dg = cugraph.Graph(directed=directed) + dg.from_dask_cudf_edgelist( + ddf, + source="src", + destination="dst", + edge_attr="value" if is_weighted else None, + renumber=True, + store_transposed=True, + ) + + input_combo["MGGraph"] = dg + + return input_combo + + +# ============================================================================= +# Tests +# ============================================================================= + + +@pytest.mark.mg +def test_dask_mg_cosine(dask_client, benchmark, input_expected_output): + + dg = input_expected_output["MGGraph"] + use_weight = input_expected_output["is_weighted"] + + result_cosine = benchmark( + dcg.cosine, dg, input_expected_output["vertex_pair"], use_weight=use_weight + ) + + result_cosine = ( + result_cosine.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"cosine_coeff": "mg_cugraph_cosine_coeff"}) + ) + + expected_output = ( + input_expected_output["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Cosine results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_cosine["sg_cugraph_cosine_coeff"] = expected_output["cosine_coeff"] + + cosine_coeff_diffs1 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff > 0.00001" + ) + cosine_coeff_diffs2 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff < -0.00001" + ) + + assert len(cosine_coeff_diffs1) == 0 + assert len(cosine_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_cosine(dask_client, benchmark, input_expected_output_all_pairs): + + dg = input_expected_output_all_pairs["MGGraph"] + + + use_weight = input_expected_output_all_pairs["is_weighted"] + + + result_cosine = benchmark( + dcg.all_pairs_cosine, dg, vertices=input_expected_output_all_pairs["vertices"], topk=input_expected_output_all_pairs["topk"], use_weight=use_weight + ) + + result_cosine = ( + result_cosine.compute() + .sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"cosine_coeff": "mg_cugraph_cosine_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph Cosine results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. 
+ result_cosine["sg_cugraph_cosine_coeff"] = expected_output["cosine_coeff"] + + cosine_coeff_diffs1 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff > 0.00001" + ) + cosine_coeff_diffs2 = result_cosine.query( + "mg_cugraph_cosine_coeff - sg_cugraph_cosine_coeff < -0.00001" + ) + + assert len(cosine_coeff_diffs1) == 0 + assert len(cosine_coeff_diffs2) == 0 From 8facad33ab1084d985b5a22263a5e89ac3f1e46c Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 05:01:21 -0700 Subject: [PATCH 17/33] properly instantiate similarity functions --- cpp/include/cugraph_c/similarity_algorithms.h | 65 +++++++++++++++++++ python/cugraph/cugraph/__init__.py | 5 +- .../pylibcugraph/pylibcugraph/CMakeLists.txt | 2 + .../_cugraph_c/similarity_algorithms.pxd | 4 +- .../all_pairs_cosine_coefficients.pyx | 6 +- .../pylibcugraph/cosine_coefficients.pyx | 6 +- 6 files changed, 79 insertions(+), 9 deletions(-) diff --git a/cpp/include/cugraph_c/similarity_algorithms.h b/cpp/include/cugraph_c/similarity_algorithms.h index 5b8462a1666..48f3ff5d52d 100644 --- a/cpp/include/cugraph_c/similarity_algorithms.h +++ b/cpp/include/cugraph_c/similarity_algorithms.h @@ -145,6 +145,33 @@ cugraph_error_code_t cugraph_overlap_coefficients(const cugraph_resource_handle_ cugraph_similarity_result_t** result, cugraph_error_t** error); +/** + * @brief Perform cosine similarity computation + * + * Compute the similarity for the specified vertex_pairs + * + * Note that cosine similarity must run on a symmetric graph. + * + * @param [in] handle Handle for accessing resources + * @param [in] graph Pointer to graph + * @param [in] vertex_pairs Vertex pair for input + * @param [in] use_weight If true consider the edge weight in the graph, if false use an + * edge weight of 1 + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). 
+ * @param [out] result Opaque pointer to similarity results + * @param [out] error Pointer to an error object storing details of any error. Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_cosine_similarity_coefficients(const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error); + /** * @brief Perform All-Pairs Jaccard similarity computation * @@ -259,6 +286,44 @@ cugraph_error_code_t cugraph_all_pairs_overlap_coefficients( cugraph_similarity_result_t** result, cugraph_error_t** error); +/** + * @brief Perform All Pairs cosine similarity computation + * + * Compute the similarity for all vertex pairs derived from the two-hop neighbors + * of an optional specified vertex list. This function will identify the two-hop + * neighbors of the specified vertices (all vertices in the graph if not specified) + * and compute similarity for those vertices. + * + * If the topk parameter is specified then the result will only contain the top k + * highest scoring results. + * + * Note that cosine similarity must run on a symmetric graph. + * + * @param [in] handle Handle for accessing resources + * @param [in] graph Pointer to graph + * @param [in] vertices Vertex list for input. If null then compute based on + * all vertices in the graph. + * @param [in] use_weight If true consider the edge weight in the graph, if false use an + * edge weight of 1 + * @param [in] topk Specify how many answers to return. Specifying SIZE_MAX + * will return all values. + * @param [in] do_expensive_check A flag to run expensive checks for input arguments (if set to + * `true`). + * @param [out] result Opaque pointer to similarity results + * @param [out] error Pointer to an error object storing details of any error. 
Will + * be populated if error code is not CUGRAPH_SUCCESS + * @return error code + */ +cugraph_error_code_t cugraph_all_pairs_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error); + #ifdef __cplusplus } #endif diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index d094bef027d..9cd8d32eb3c 100644 --- a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -82,7 +82,10 @@ all_pairs_overlap, sorensen, sorensen_coefficient, - all_pairs_sorensen + all_pairs_sorensen, + cosine, + cosine_coefficient, + all_pairs_cosine ) from cugraph.traversal import ( diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index 53fbb00f1c1..90fce23282e 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -39,6 +39,7 @@ set(cython_sources jaccard_coefficients.pyx sorensen_coefficients.pyx overlap_coefficients.pyx + cosine_coefficients.pyx katz_centrality.pyx leiden.pyx louvain.pyx @@ -61,6 +62,7 @@ set(cython_sources all_pairs_jaccard_coefficients.pyx all_pairs_sorensen_coefficients.pyx all_pairs_overlap_coefficients.pyx + all_pairs_cosine_coefficients.pyx ) set(linked_libraries cugraph::cugraph;cugraph::cugraph_c) diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd index 061b7138c84..e969afee76f 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd @@ -138,7 +138,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": 
########################################################################### # cosine coefficients cdef cugraph_error_code_t \ - cugraph_cosine_coefficients( + cugraph_cosine_similarity_coefficients( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, const cugraph_vertex_pairs_t* vertex_pairs, @@ -151,7 +151,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": ########################################################################### # all-pairs cosine coefficients cdef cugraph_error_code_t \ - cugraph_all_pairs_cosine_coefficients( + cugraph_all_pairs_cosine_similarity_coefficients( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, const cugraph_type_erased_device_array_view_t* vertices, diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx index 2ced2cc127b..0bf92b01614 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx @@ -39,7 +39,7 @@ from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, ) from pylibcugraph._cugraph_c.similarity_algorithms cimport ( - cugraph_all_pairs_cosine_coefficients, + cugraph_all_pairs_cosine_similarity_coefficients, cugraph_similarity_result_t, cugraph_similarity_result_get_similarity, cugraph_similarity_result_get_vertex_pairs, @@ -123,7 +123,7 @@ def all_pairs_cosine_coefficients(ResourceHandle resource_handle, create_cugraph_type_erased_device_array_view_from_py_obj( vertices) - error_code = cugraph_all_pairs_cosine_coefficients(c_resource_handle_ptr, + error_code = cugraph_all_pairs_cosine_similarity_coefficients(c_resource_handle_ptr, c_graph_ptr, vertices_view_ptr, use_weight, @@ -131,7 +131,7 @@ def all_pairs_cosine_coefficients(ResourceHandle resource_handle, do_expensive_check, &result_ptr, &error_ptr) - assert_success(error_code, error_ptr, "cugraph_all_pairs_cosine_coefficients") + 
assert_success(error_code, error_ptr, "cugraph_all_pairs_cosine_similarity_coefficients") # Extract individual device array pointers from result and copy to cupy # arrays for returning. diff --git a/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx b/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx index b5392e1e7e6..df194fe364e 100644 --- a/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/cosine_coefficients.pyx @@ -41,7 +41,7 @@ from pylibcugraph._cugraph_c.graph cimport ( cugraph_graph_t, ) from pylibcugraph._cugraph_c.similarity_algorithms cimport ( - cugraph_cosine_coefficients, + cugraph_cosine_similarity_coefficients, cugraph_similarity_result_t, cugraph_similarity_result_get_similarity, cugraph_similarity_result_free @@ -135,14 +135,14 @@ def cosine_coefficients(ResourceHandle resource_handle, &error_ptr) assert_success(error_code, error_ptr, "vertex_pairs") - error_code = cugraph_cosine_coefficients(c_resource_handle_ptr, + error_code = cugraph_cosine_similarity_coefficients(c_resource_handle_ptr, c_graph_ptr, vertex_pairs_ptr, use_weight, do_expensive_check, &result_ptr, &error_ptr) - assert_success(error_code, error_ptr, "cugraph_cosine_coefficients") + assert_success(error_code, error_ptr, "cugraph_cosine_similarity_coefficients") # Extract individual device array pointers from result and copy to cupy # arrays for returning. 
From 30a3db6e47c6a826a6e8c16976b9f3bb8ef25061 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 05:47:36 -0700 Subject: [PATCH 18/33] add C tests for cosine similarity algo --- cpp/src/c_api/similarity.cpp | 71 +++++++++++++++++++++++++++++++ cpp/tests/c_api/similarity_test.c | 65 +++++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 1 deletion(-) diff --git a/cpp/src/c_api/similarity.cpp b/cpp/src/c_api/similarity.cpp index 9b5a6a8c885..071f77e3172 100644 --- a/cpp/src/c_api/similarity.cpp +++ b/cpp/src/c_api/similarity.cpp @@ -340,6 +340,32 @@ struct overlap_functor { } }; +struct cosine_similarity_functor { + template + rmm::device_uvector operator()( + raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::tuple, raft::device_span> vertex_pairs) + { + return cugraph::cosine_similarity_coefficients(handle, graph_view, edge_weight_view, vertex_pairs); + } + + template + std::tuple, + rmm::device_uvector, + rmm::device_uvector> + operator()(raft::handle_t const& handle, + cugraph::graph_view_t const& graph_view, + std::optional> edge_weight_view, + std::optional> vertices, + std::optional topk) + { + return cugraph::cosine_similarity_all_pairs_coefficients( + handle, graph_view, edge_weight_view, vertices, topk); + } +}; + } // namespace extern "C" cugraph_type_erased_device_array_view_t* cugraph_similarity_result_get_similarity( @@ -431,6 +457,28 @@ extern "C" cugraph_error_code_t cugraph_overlap_coefficients( return cugraph::c_api::run_algorithm(graph, functor, result, error); } +extern "C" cugraph_error_code_t cugraph_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error) +{ + if (use_weight) { + CAPI_EXPECTS( + reinterpret_cast(graph)->edge_weights_ != nullptr, + 
CUGRAPH_INVALID_INPUT, + "use_weight is true but edge weights are not provided.", + *error); + } + similarity_functor functor( + handle, graph, vertex_pairs, cosine_similarity_functor{}, use_weight, do_expensive_check); + + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} + extern "C" cugraph_error_code_t cugraph_all_pairs_jaccard_coefficients( const cugraph_resource_handle_t* handle, cugraph_graph_t* graph, @@ -497,5 +545,28 @@ extern "C" cugraph_error_code_t cugraph_all_pairs_overlap_coefficients( all_pairs_similarity_functor functor( handle, graph, vertices, overlap_functor{}, use_weight, topk, do_expensive_check); + return cugraph::c_api::run_algorithm(graph, functor, result, error); +} + +extern "C" cugraph_error_code_t cugraph_all_pairs_cosine_similarity_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error) +{ + if (use_weight) { + CAPI_EXPECTS( + reinterpret_cast(graph)->edge_weights_ != nullptr, + CUGRAPH_INVALID_INPUT, + "use_weight is true but edge weights are not provided.", + *error); + } + all_pairs_similarity_functor functor( + handle, graph, vertices, overlap_functor{}, use_weight, topk, do_expensive_check); + return cugraph::c_api::run_algorithm(graph, functor, result, error); } \ No newline at end of file diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index c29af658ce9..960fc0a9617 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -26,7 +26,7 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -typedef enum { JACCARD, SORENSEN, OVERLAP } similarity_t; +typedef enum { JACCARD, SORENSEN, OVERLAP, COSINE} similarity_t; int generic_similarity_test(vertex_t* h_src, vertex_t* h_dst, @@ -101,6 +101,10 @@ int 
generic_similarity_test(vertex_t* h_src, ret_code = cugraph_overlap_coefficients( handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); break; + case COSINE: + ret_code = cugraph_cosine_similarity_coefficients( + handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -405,6 +409,62 @@ int test_weighted_overlap() OVERLAP); } +int test_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + COSINE); +} + +int test_weighted_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; + //weight_t h_result[] = {0.714286, 0.416667, 0.000000}; + + return generic_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + COSINE); +} + int test_all_pairs_jaccard() { size_t num_edges = 16; @@ -817,6 +877,8 @@ int test_weighted_all_pairs_overlap_topk() int main(int argc, char** argv) { int result = 0; + result |= RUN_TEST(test_cosine); + #if 
0 result |= RUN_TEST(test_jaccard); result |= RUN_TEST(test_sorensen); result |= RUN_TEST(test_overlap); @@ -835,5 +897,6 @@ int main(int argc, char** argv) result |= RUN_TEST(test_weighted_all_pairs_jaccard_topk); result |= RUN_TEST(test_weighted_all_pairs_sorensen_topk); result |= RUN_TEST(test_weighted_all_pairs_overlap_topk); + #endif return result; } From 41cb9b163d91805fbcf8e86bc8efa02d790ed9ba Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 10:18:28 -0700 Subject: [PATCH 19/33] update similarity tests --- cpp/tests/c_api/similarity_test.c | 173 ++++++++++++++++++++++++++++-- 1 file changed, 164 insertions(+), 9 deletions(-) diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index 960fc0a9617..a63bc4b3df9 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -183,6 +183,10 @@ int generic_all_pairs_similarity_test(vertex_t* h_src, ret_code = cugraph_all_pairs_overlap_coefficients( handle, graph, vertices_view, use_weight, topk, FALSE, &result, &ret_error); break; + case COSINE: + ret_code = cugraph_all_pairs_cosine_similarity_coefficients( + handle, graph, vertices_view, use_weight, topk, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -337,7 +341,7 @@ int test_weighted_sorensen() vertex_t h_first[] = {0, 0, 1}; vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.526316, 0.344828, 0.000000}; + weight_t h_result[] = {0.526316, 0.344828, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -393,7 +397,7 @@ int test_weighted_overlap() vertex_t h_first[] = {0, 0, 1}; vertex_t h_second[] = {1, 2, 3}; - weight_t h_result[] = {0.714286, 0.416667, 0.000000}; + weight_t h_result[] = {0.714286, 0.416667, 0.0}; return generic_similarity_test(h_src, h_dst, @@ -440,16 +444,16 @@ int test_weighted_cosine() { size_t num_edges = 16; size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_pairs = 
2; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; - //weight_t h_result[] = {0.714286, 0.416667, 0.000000}; + vertex_t h_first[] = {0, 0}; + vertex_t h_second[] = {1, 2}; + weight_t h_result[] = {0.990830, 0.976187}; return generic_similarity_test(h_src, h_dst, @@ -465,6 +469,152 @@ int test_weighted_cosine() COSINE); } + + +int test_all_pairs_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 22; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5}; + vertex_t h_second[] = {1, 2, 3, 4, 0, 2, 3, 5, 0, 1, 3, 4, 5, 0, 1, 2, 4, 0, 2, 3, 1, 2}; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + SIZE_MAX, + COSINE); +} + +int test_weighted_all_pairs_cosine_topk() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 6; + size_t topk = 6; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 1, 1, 2, 3, 4}; + vertex_t h_second[] = {1, 0, 2, 1, 4, 3}; + weight_t h_result[] = {0.0, 0.0, 1.0, 1.0, 1.0, 1.0}; + + 
return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + topk, + COSINE); +} + +int test_all_pairs_cosine_topk() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t topk = 6; + size_t num_pairs = 6; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 1, 3, 3, 4, 5}; + vertex_t h_second[] = {3, 5, 0, 4, 3, 1}; + weight_t h_result[] = {1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + topk, + COSINE); +} + + + +int test_weighted_all_pairs_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 16; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6}; + vertex_t h_second[] = {1, 2, 0, 2, 0, 1, 4, 5, 6, 3, 5, 6, 3, 4, 3, 4}; + weight_t h_result[] = {0.714286, + 0.416667, + 0.714286, + 1, + 0.416667, + 1, + 1, + 0.166667, + 0.5, + 1, + 0.571429, + 0.75, + 0.166667, + 0.571429, + 0.5, + 0.75}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + SIZE_MAX, + COSINE); +} + + + + + + + + + + int test_all_pairs_jaccard() { size_t num_edges = 16; @@ -877,26 +1027,31 @@ int test_weighted_all_pairs_overlap_topk() int main(int argc, char** argv) { int result = 0; - result |= RUN_TEST(test_cosine); 
- #if 0 + result |= RUN_TEST(test_jaccard); result |= RUN_TEST(test_sorensen); result |= RUN_TEST(test_overlap); + result |= RUN_TEST(test_cosine); result |= RUN_TEST(test_weighted_jaccard); result |= RUN_TEST(test_weighted_sorensen); result |= RUN_TEST(test_weighted_overlap); + result |= RUN_TEST(test_weighted_cosine); result |= RUN_TEST(test_all_pairs_jaccard); result |= RUN_TEST(test_all_pairs_sorensen); result |= RUN_TEST(test_all_pairs_overlap); + result |= RUN_TEST(test_all_pairs_cosine); result |= RUN_TEST(test_weighted_all_pairs_jaccard); result |= RUN_TEST(test_weighted_all_pairs_sorensen); result |= RUN_TEST(test_weighted_all_pairs_overlap); + result |= RUN_TEST(test_weighted_all_pairs_cosine); result |= RUN_TEST(test_all_pairs_jaccard_topk); result |= RUN_TEST(test_all_pairs_sorensen_topk); result |= RUN_TEST(test_all_pairs_overlap_topk); + result |= RUN_TEST(test_all_pairs_cosine_topk); result |= RUN_TEST(test_weighted_all_pairs_jaccard_topk); result |= RUN_TEST(test_weighted_all_pairs_sorensen_topk); result |= RUN_TEST(test_weighted_all_pairs_overlap_topk); - #endif + result |= RUN_TEST(test_weighted_all_pairs_cosine_topk); + return result; } From f9cafa185ec7d8025c1e3739753b8ffb71401d7b Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 11:12:28 -0700 Subject: [PATCH 20/33] update MG similarity CAPI tests --- cpp/tests/c_api/mg_similarity_test.c | 227 +++++++++++++++++++++- cpp/tests/c_api/similarity_test.c | 281 +++++++++++++-------------- 2 files changed, 361 insertions(+), 147 deletions(-) diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index 1d65a161594..b53788d01d6 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -26,7 +26,7 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -typedef enum { JACCARD, SORENSEN, OVERLAP, ALL_PAIRS_JACCARD, ALL_PAIRS_SORENSEN, ALL_PAIRS_OVERLAP } similarity_t; +typedef enum { JACCARD, SORENSEN, 
OVERLAP, COSINE, ALL_PAIRS_JACCARD, ALL_PAIRS_SORENSEN, ALL_PAIRS_OVERLAP, ALL_PAIRS_COSINE } similarity_t; int generic_similarity_test(const cugraph_resource_handle_t* handle, vertex_t* h_src, @@ -135,6 +135,14 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, ret_code = cugraph_all_pairs_overlap_coefficients( handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); break; + case COSINE: + ret_code = cugraph_cosine_similarity_coefficients( + handle, graph, vertex_pairs, use_weight, FALSE, &result, &ret_error); + break; + case ALL_PAIRS_COSINE: + ret_code = cugraph_all_pairs_cosine_similarity_coefficients( + handle, graph, start_v_view, use_weight, topk, FALSE, &result, &ret_error); + break; } TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); @@ -154,6 +162,9 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, case ALL_PAIRS_OVERLAP: num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); break; + case ALL_PAIRS_COSINE: + num_pairs = cugraph_type_erased_device_array_view_size(similarity_coefficient); + break; } weight_t h_similarity_coefficient[num_pairs]; @@ -694,6 +705,213 @@ int test_all_pairs_overlap_with_topk(const cugraph_resource_handle_t* handle) FALSE, ALL_PAIRS_OVERLAP); } + + + + + + + + + + + + +int test_cosine(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + return 
generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + COSINE); +} + +int test_weighted_cosine(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 2; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0}; + vertex_t h_second[] = {1, 2}; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.990830, 0.976187}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + TRUE, + COSINE); +} + +int test_all_pairs_cosine(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 0; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_COSINE); +} + +int 
test_all_pairs_cosine_with_start_vertices(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_COSINE); +} + +int test_all_pairs_cosine_with_topk(const cugraph_resource_handle_t* handle) +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; + size_t num_start_vertices = 3; + size_t topk = 5; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t* h_start_vertices = NULL; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_similarity_test(handle, + h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_start_vertices, + h_result, + num_vertices, + num_edges, + num_pairs, + num_start_vertices, + topk, + FALSE, + FALSE, + ALL_PAIRS_COSINE); +} + + + + + + + + + + + + + + + + + + + + + + + + /******************************************************************************/ @@ -704,6 +922,7 @@ int main(int argc, char** argv) cugraph_resource_handle_t* handle = cugraph_create_resource_handle(raft_handle); int result = 0; + result |= 
RUN_MG_TEST(test_jaccard, handle); result |= RUN_MG_TEST(test_weighted_jaccard, handle); result |= RUN_MG_TEST(test_all_pairs_jaccard, handle); @@ -722,6 +941,12 @@ int main(int argc, char** argv) result |= RUN_MG_TEST(test_all_pairs_overlap_with_start_vertices, handle); result |= RUN_MG_TEST(test_all_pairs_overlap_with_topk, handle); + result |= RUN_MG_TEST(test_cosine, handle); + result |= RUN_MG_TEST(test_weighted_cosine, handle); + result |= RUN_MG_TEST(test_all_pairs_cosine, handle); + result |= RUN_MG_TEST(test_all_pairs_cosine_with_start_vertices, handle); + result |= RUN_MG_TEST(test_all_pairs_cosine_with_topk, handle); + cugraph_free_resource_handle(handle); free_mg_raft_handle(raft_handle); diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index a63bc4b3df9..0d544ff82d6 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -469,152 +469,6 @@ int test_weighted_cosine() COSINE); } - - -int test_all_pairs_cosine() -{ - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 22; - - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5}; - vertex_t h_second[] = {1, 2, 3, 4, 0, 2, 3, 5, 0, 1, 3, 4, 5, 0, 1, 2, 4, 0, 2, 3, 1, 2}; - weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; - - return generic_all_pairs_similarity_test(h_src, - h_dst, - h_wgt, - h_first, - h_second, - h_result, - num_vertices, - num_edges, - num_pairs, - FALSE, - FALSE, - SIZE_MAX, - COSINE); -} - -int test_weighted_all_pairs_cosine_topk() -{ - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 6; - size_t topk = 6; - - vertex_t 
h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; - vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; - weight_t h_wgt[] = { - 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - - vertex_t h_first[] = {0, 1, 1, 2, 3, 4}; - vertex_t h_second[] = {1, 0, 2, 1, 4, 3}; - weight_t h_result[] = {0.0, 0.0, 1.0, 1.0, 1.0, 1.0}; - - return generic_all_pairs_similarity_test(h_src, - h_dst, - h_wgt, - h_first, - h_second, - h_result, - num_vertices, - num_edges, - num_pairs, - FALSE, - TRUE, - topk, - COSINE); -} - -int test_all_pairs_cosine_topk() -{ - size_t num_edges = 16; - size_t num_vertices = 6; - size_t topk = 6; - size_t num_pairs = 6; - - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 1, 3, 3, 4, 5}; - vertex_t h_second[] = {3, 5, 0, 4, 3, 1}; - weight_t h_result[] = {1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000}; - - return generic_all_pairs_similarity_test(h_src, - h_dst, - h_wgt, - h_first, - h_second, - h_result, - num_vertices, - num_edges, - num_pairs, - FALSE, - FALSE, - topk, - COSINE); -} - - - -int test_weighted_all_pairs_cosine() -{ - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 16; - - vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; - vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; - weight_t h_wgt[] = { - 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - - vertex_t h_first[] = {0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6}; - vertex_t h_second[] = {1, 2, 0, 2, 0, 1, 4, 5, 6, 3, 5, 6, 3, 4, 3, 4}; - weight_t h_result[] = {0.714286, - 0.416667, - 0.714286, - 1, - 0.416667, - 1, - 1, - 0.166667, - 0.5, - 1, - 0.571429, - 0.75, - 0.166667, - 0.571429, - 0.5, - 0.75}; - - return 
generic_all_pairs_similarity_test(h_src, - h_dst, - h_wgt, - h_first, - h_second, - h_result, - num_vertices, - num_edges, - num_pairs, - FALSE, - TRUE, - SIZE_MAX, - COSINE); -} - - - - - - - - - - int test_all_pairs_jaccard() { size_t num_edges = 16; @@ -841,6 +695,65 @@ int test_weighted_all_pairs_overlap() OVERLAP); } +int test_all_pairs_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 22; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5}; + vertex_t h_second[] = {1, 2, 3, 4, 0, 2, 3, 5, 0, 1, 3, 4, 5, 0, 1, 2, 4, 0, 2, 3, 1, 2}; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + SIZE_MAX, + COSINE); +} + +int test_weighted_all_pairs_cosine_topk() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 6; + size_t topk = 6; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 1, 1, 2, 3, 4}; + vertex_t h_second[] = {1, 0, 2, 1, 4, 3}; + weight_t h_result[] = {0.0, 0.0, 1.0, 1.0, 1.0, 1.0}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + topk, + COSINE); +} + int test_all_pairs_jaccard_topk() { size_t num_edges = 16; @@ -1022,6 +935,82 @@ int 
test_weighted_all_pairs_overlap_topk() OVERLAP); } +int test_all_pairs_cosine_topk() +{ + size_t num_edges = 16; + size_t num_vertices = 6; + size_t topk = 6; + size_t num_pairs = 6; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 1, 3, 3, 4, 5}; + vertex_t h_second[] = {3, 5, 0, 4, 3, 1}; + weight_t h_result[] = {1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + FALSE, + topk, + COSINE); +} + + + +int test_weighted_all_pairs_cosine() +{ + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 16; + + vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; + vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; + weight_t h_wgt[] = { + 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; + + vertex_t h_first[] = {0, 0, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 6, 6}; + vertex_t h_second[] = {1, 2, 0, 2, 0, 1, 4, 5, 6, 3, 5, 6, 3, 4, 3, 4}; + weight_t h_result[] = {0.714286, + 0.416667, + 0.714286, + 1, + 0.416667, + 1, + 1, + 0.166667, + 0.5, + 1, + 0.571429, + 0.75, + 0.166667, + 0.571429, + 0.5, + 0.75}; + + return generic_all_pairs_similarity_test(h_src, + h_dst, + h_wgt, + h_first, + h_second, + h_result, + num_vertices, + num_edges, + num_pairs, + FALSE, + TRUE, + SIZE_MAX, + COSINE); +} + /******************************************************************************/ int main(int argc, char** argv) From 1aec8e4ade3c8109fe7da3ac89aa144757f2bb19 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 11:17:34 -0700 Subject: [PATCH 21/33] fix style --- cpp/include/cugraph_c/similarity_algorithms.h | 15 +- cpp/src/c_api/similarity.cpp 
| 38 +- cpp/tests/c_api/mg_similarity_test.c | 484 +++++++++--------- cpp/tests/c_api/similarity_test.c | 10 +- python/cugraph/cugraph/__init__.py | 2 +- .../cugraph/dask/link_prediction/cosine.py | 17 +- .../cugraph/dask/link_prediction/jaccard.py | 17 +- .../cugraph/dask/link_prediction/overlap.py | 19 +- .../cugraph/dask/link_prediction/sorensen.py | 17 +- .../cugraph/cugraph/link_prediction/cosine.py | 21 +- .../cugraph/link_prediction/jaccard.py | 21 +- .../cugraph/link_prediction/overlap.py | 22 +- .../cugraph/link_prediction/sorensen.py | 20 +- .../tests/link_prediction/test_cosine_mg.py | 31 +- .../tests/link_prediction/test_jaccard.py | 55 +- .../tests/link_prediction/test_jaccard_mg.py | 31 +- .../tests/link_prediction/test_overlap.py | 55 +- .../tests/link_prediction/test_overlap_mg.py | 19 +- .../tests/link_prediction/test_sorensen.py | 55 +- .../tests/link_prediction/test_sorensen_mg.py | 19 +- .../_cugraph_c/similarity_algorithms.pxd | 12 +- .../all_pairs_cosine_coefficients.pyx | 4 +- .../all_pairs_jaccard_coefficients.pyx | 4 +- .../all_pairs_overlap_coefficients.pyx | 4 +- .../all_pairs_sorensen_coefficients.pyx | 4 +- 25 files changed, 545 insertions(+), 451 deletions(-) diff --git a/cpp/include/cugraph_c/similarity_algorithms.h b/cpp/include/cugraph_c/similarity_algorithms.h index 48f3ff5d52d..12f55132fc7 100644 --- a/cpp/include/cugraph_c/similarity_algorithms.h +++ b/cpp/include/cugraph_c/similarity_algorithms.h @@ -164,13 +164,14 @@ cugraph_error_code_t cugraph_overlap_coefficients(const cugraph_resource_handle_ * be populated if error code is not CUGRAPH_SUCCESS * @return error code */ -cugraph_error_code_t cugraph_cosine_similarity_coefficients(const cugraph_resource_handle_t* handle, - cugraph_graph_t* graph, - const cugraph_vertex_pairs_t* vertex_pairs, - bool_t use_weight, - bool_t do_expensive_check, - cugraph_similarity_result_t** result, - cugraph_error_t** error); +cugraph_error_code_t cugraph_cosine_similarity_coefficients( + const 
cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_vertex_pairs_t* vertex_pairs, + bool_t use_weight, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error); /** * @brief Perform All-Pairs Jaccard similarity computation diff --git a/cpp/src/c_api/similarity.cpp b/cpp/src/c_api/similarity.cpp index 071f77e3172..36f1a74f3e0 100644 --- a/cpp/src/c_api/similarity.cpp +++ b/cpp/src/c_api/similarity.cpp @@ -211,20 +211,22 @@ struct all_pairs_similarity_functor : public cugraph::c_api::abstract_functor { vertices_->as_type(), vertices_->size_}) : std::nullopt, topk_ != SIZE_MAX ? std::make_optional(topk_) : std::nullopt); - - cugraph::unrenumber_int_vertices(handle_, - v1.data(), - v1.size(), - number_map->data(), - graph_view.vertex_partition_range_lasts(), - false); - - cugraph::unrenumber_int_vertices(handle_, - v2.data(), - v2.size(), - number_map->data(), - graph_view.vertex_partition_range_lasts(), - false); + + cugraph::unrenumber_int_vertices( + handle_, + v1.data(), + v1.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + false); + + cugraph::unrenumber_int_vertices( + handle_, + v2.data(), + v2.size(), + number_map->data(), + graph_view.vertex_partition_range_lasts(), + false); result_ = new cugraph::c_api::cugraph_similarity_result_t{ new cugraph::c_api::cugraph_type_erased_device_array_t(similarity_coefficients, @@ -296,7 +298,8 @@ struct cosine_functor { std::optional> edge_weight_view, std::tuple, raft::device_span> vertex_pairs) { - return cugraph::cosine_similarity_coefficients(handle, graph_view, edge_weight_view, vertex_pairs); + return cugraph::cosine_similarity_coefficients( + handle, graph_view, edge_weight_view, vertex_pairs); } template @@ -348,7 +351,8 @@ struct cosine_similarity_functor { std::optional> edge_weight_view, std::tuple, raft::device_span> vertex_pairs) { - return cugraph::cosine_similarity_coefficients(handle, graph_view, edge_weight_view, 
vertex_pairs); + return cugraph::cosine_similarity_coefficients( + handle, graph_view, edge_weight_view, vertex_pairs); } template @@ -569,4 +573,4 @@ extern "C" cugraph_error_code_t cugraph_all_pairs_cosine_similarity_coefficients handle, graph, vertices, overlap_functor{}, use_weight, topk, do_expensive_check); return cugraph::c_api::run_algorithm(graph, functor, result, error); -} \ No newline at end of file +} diff --git a/cpp/tests/c_api/mg_similarity_test.c b/cpp/tests/c_api/mg_similarity_test.c index b53788d01d6..486ca34aaca 100644 --- a/cpp/tests/c_api/mg_similarity_test.c +++ b/cpp/tests/c_api/mg_similarity_test.c @@ -26,7 +26,16 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -typedef enum { JACCARD, SORENSEN, OVERLAP, COSINE, ALL_PAIRS_JACCARD, ALL_PAIRS_SORENSEN, ALL_PAIRS_OVERLAP, ALL_PAIRS_COSINE } similarity_t; +typedef enum { + JACCARD, + SORENSEN, + OVERLAP, + COSINE, + ALL_PAIRS_JACCARD, + ALL_PAIRS_SORENSEN, + ALL_PAIRS_OVERLAP, + ALL_PAIRS_COSINE +} similarity_t; int generic_similarity_test(const cugraph_resource_handle_t* handle, vertex_t* h_src, @@ -51,14 +60,14 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, cugraph_error_code_t ret_code = CUGRAPH_SUCCESS; cugraph_error_t* ret_error; - cugraph_graph_t* graph = NULL; - cugraph_similarity_result_t* result = NULL; - cugraph_vertex_pairs_t* vertex_pairs = NULL; - cugraph_type_erased_device_array_t* v1 = NULL; - cugraph_type_erased_device_array_t* v2 = NULL; - cugraph_type_erased_device_array_t* start_v = NULL; - cugraph_type_erased_device_array_view_t* v1_view = NULL; - cugraph_type_erased_device_array_view_t* v2_view = NULL; + cugraph_graph_t* graph = NULL; + cugraph_similarity_result_t* result = NULL; + cugraph_vertex_pairs_t* vertex_pairs = NULL; + cugraph_type_erased_device_array_t* v1 = NULL; + cugraph_type_erased_device_array_t* v2 = NULL; + cugraph_type_erased_device_array_t* start_v = NULL; + cugraph_type_erased_device_array_view_t* 
v1_view = NULL; + cugraph_type_erased_device_array_view_t* v2_view = NULL; cugraph_type_erased_device_array_view_t* start_v_view = NULL; ret_code = create_test_graph( @@ -67,12 +76,11 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create_test_graph failed."); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, cugraph_error_message(ret_error)); - if (topk == 0) { topk = SIZE_MAX;} + if (topk == 0) { topk = SIZE_MAX; } - if (cugraph_resource_handle_get_rank(handle) != 0) { num_pairs = 0;} + if (cugraph_resource_handle_get_rank(handle) != 0) { num_pairs = 0; } if (h_first != NULL && h_second != NULL) { - ret_code = cugraph_type_erased_device_array_create(handle, num_pairs, vertex_tid, &v1, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); @@ -97,17 +105,17 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "create vertex pairs failed."); } - if (h_start_vertices != NULL) { - ret_code = - cugraph_type_erased_device_array_create(handle, num_start_vertices, vertex_tid, &start_v, &ret_error); + ret_code = cugraph_type_erased_device_array_create( + handle, num_start_vertices, vertex_tid, &start_v, &ret_error); TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "v1 create failed."); start_v_view = cugraph_type_erased_device_array_view(start_v); ret_code = cugraph_type_erased_device_array_view_copy_from_host( handle, start_v_view, (byte_t*)h_start_vertices, &ret_error); - TEST_ASSERT(test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_start_vertices copy_from_host failed."); + TEST_ASSERT( + test_ret_value, ret_code == CUGRAPH_SUCCESS, "h_start_vertices copy_from_host failed."); } switch (test_type) { @@ -179,7 +187,6 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, "similarity results don't match"); } - if (result != NULL) 
cugraph_similarity_result_free(result); if (vertex_pairs != NULL) cugraph_vertex_pairs_free(vertex_pairs); cugraph_mg_graph_free(graph); @@ -190,17 +197,17 @@ int generic_similarity_test(const cugraph_resource_handle_t* handle, int test_jaccard(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.2, 0.666667, 0.333333, 0.4, 0.166667, 0.5, 0.2, 0.25, 0.25, 0.666667}; @@ -224,21 +231,21 @@ int test_jaccard(const cugraph_resource_handle_t* handle) int test_weighted_jaccard(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; + vertex_t h_first[] = {0, 0, 
1}; + vertex_t h_second[] = {1, 2, 3}; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.357143, 0.208333, 0.0}; + weight_t h_result[] = {0.357143, 0.208333, 0.0}; return generic_similarity_test(handle, h_src, @@ -260,19 +267,21 @@ int test_weighted_jaccard(const cugraph_resource_handle_t* handle) int test_all_pairs_jaccard(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, 0.25, 0.4, 0.2, 0.25, 0.25, 0.666667, 0.166667, 0.2, 0.666667, 0.3333333, 0.25, 0.666667, 0.5, 0.25}; + weight_t h_result[] = {0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, + 0.25, 0.4, 0.2, 0.25, 0.25, 0.666667, 0.166667, 0.2, + 0.666667, 0.3333333, 0.25, 0.666667, 0.5, 0.25}; return generic_similarity_test(handle, h_src, @@ -294,19 +303,20 @@ int test_all_pairs_jaccard(const cugraph_resource_handle_t* handle) int test_all_pairs_jaccard_with_start_vertices(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 3; - size_t topk = 0; + size_t topk = 0; - 
vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; - vertex_t h_start_vertices[] = {0, 1, 2}; - weight_t h_result[] = {0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, 0.25, 0.4, 0.2, 0.25, 0.25}; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = { + 0.2, 0.25, 0.666667, 0.333333, 0.2, 0.4, 0.166667, 0.5, 0.25, 0.4, 0.2, 0.25, 0.25}; return generic_similarity_test(handle, h_src, @@ -328,19 +338,19 @@ int test_all_pairs_jaccard_with_start_vertices(const cugraph_resource_handle_t* int test_all_pairs_jaccard_with_topk(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 3; - size_t topk = 5; + size_t topk = 5; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.666667, 0.666667, 0.666667, 0.666667, 0.5}; + weight_t h_result[] = {0.666667, 0.666667, 0.666667, 
0.666667, 0.5}; return generic_similarity_test(handle, h_src, @@ -360,21 +370,19 @@ int test_all_pairs_jaccard_with_topk(const cugraph_resource_handle_t* handle) ALL_PAIRS_JACCARD); } - - int test_sorensen(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; vertex_t* h_start_vertices = NULL; weight_t h_result[] = {0.333333, 0.8, 0.5, 0.571429, 0.285714, 0.666667, 0.333333, 0.4, 0.4, 0.8}; @@ -398,21 +406,21 @@ int test_sorensen(const cugraph_resource_handle_t* handle) int test_weighted_sorensen(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; - vertex_t h_second[] = {1, 2, 3}; + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 
3}; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.526316, 0.344828, 0.000000}; + weight_t h_result[] = {0.526316, 0.344828, 0.000000}; return generic_similarity_test(handle, h_src, @@ -434,19 +442,21 @@ int test_weighted_sorensen(const cugraph_resource_handle_t* handle) int test_all_pairs_sorensen(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.333333, 0.4, 0.8, 0.5, 0.333333, 0.571429, 0.285714, 0.666667, 0.4, 0.571429, 0.333333, 0.4, 0.4, 0.8, 0.285714, 0.333333, 0.8, 0.5, 0.4, 0.8, 0.666667, 0.4}; + weight_t h_result[] = {0.333333, 0.4, 0.8, 0.5, 0.333333, 0.571429, 0.285714, 0.666667, + 0.4, 0.571429, 0.333333, 0.4, 0.4, 0.8, 0.285714, 0.333333, + 0.8, 0.5, 0.4, 0.8, 0.666667, 0.4}; return generic_similarity_test(handle, h_src, @@ -468,19 +478,31 @@ int test_all_pairs_sorensen(const cugraph_resource_handle_t* handle) int test_all_pairs_sorensen_with_start_vertices(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 3; - size_t topk = 0; - - vertex_t h_src[] = {0, 1, 1, 2, 2, 
2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; - vertex_t h_start_vertices[] = {0, 1, 2}; - weight_t h_result[] = {0.333333, 0.4, 0.8, 0.5, 0.333333, 0.571429, 0.285714, 0.666667, 0.4, 0.571429, 0.333333, 0.4, 0.4}; + size_t topk = 0; + + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = {0.333333, + 0.4, + 0.8, + 0.5, + 0.333333, + 0.571429, + 0.285714, + 0.666667, + 0.4, + 0.571429, + 0.333333, + 0.4, + 0.4}; return generic_similarity_test(handle, h_src, @@ -502,19 +524,19 @@ int test_all_pairs_sorensen_with_start_vertices(const cugraph_resource_handle_t* int test_all_pairs_sorensen_with_topk(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 3; - size_t topk = 5; + size_t topk = 5; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.8, 0.8, 0.8, 0.8, 0.666667}; + weight_t h_result[] = {0.8, 
0.8, 0.8, 0.8, 0.666667}; return generic_similarity_test(handle, h_src, @@ -536,19 +558,19 @@ int test_all_pairs_sorensen_with_topk(const cugraph_resource_handle_t* handle) int test_overlap(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.5, 1, 0.5, 0.666667, 0.333333, 1, 0.333333, 0.5, 0.5, 1}; + weight_t h_result[] = {0.5, 1, 0.5, 0.666667, 0.333333, 1, 0.333333, 0.5, 0.5, 1}; return generic_similarity_test(handle, h_src, @@ -570,21 +592,21 @@ int test_overlap(const cugraph_resource_handle_t* handle) int test_weighted_overlap(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 3; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 3; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = {0, 0, 1}; 
- vertex_t h_second[] = {1, 2, 3}; + vertex_t h_first[] = {0, 0, 1}; + vertex_t h_second[] = {1, 2, 3}; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.714286, 0.416667, 0.000000}; + weight_t h_result[] = {0.714286, 0.416667, 0.000000}; return generic_similarity_test(handle, h_src, @@ -606,19 +628,21 @@ int test_weighted_overlap(const cugraph_resource_handle_t* handle) int test_all_pairs_overlap(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, + 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, + 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; return generic_similarity_test(handle, h_src, @@ -640,19 +664,20 @@ int test_all_pairs_overlap(const cugraph_resource_handle_t* handle) int test_all_pairs_overlap_with_start_vertices(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 3; - 
size_t topk = 0; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; - vertex_t h_start_vertices[] = {0, 1, 2}; - weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = { + 0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; return generic_similarity_test(handle, h_src, @@ -674,19 +699,19 @@ int test_all_pairs_overlap_with_start_vertices(const cugraph_resource_handle_t* int test_all_pairs_overlap_with_topk(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 3; - size_t topk = 5; + size_t topk = 5; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0}; + weight_t h_result[] = {1.0, 1.0, 
1.0, 1.0, 1.0}; return generic_similarity_test(handle, h_src, @@ -706,32 +731,21 @@ int test_all_pairs_overlap_with_topk(const cugraph_resource_handle_t* handle) ALL_PAIRS_OVERLAP); } - - - - - - - - - - - int test_cosine(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 10; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 10; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; - vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t h_first[] = {0, 0, 0, 1, 1, 1, 2, 2, 2, 3}; + vertex_t h_second[] = {1, 3, 4, 2, 3, 5, 3, 4, 5, 4}; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; return generic_similarity_test(handle, h_src, @@ -753,21 +767,21 @@ int test_cosine(const cugraph_resource_handle_t* handle) int test_weighted_cosine(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 7; - size_t num_pairs = 2; + size_t num_edges = 16; + size_t num_vertices = 7; + size_t num_pairs = 2; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; vertex_t h_src[] = {0, 1, 2, 0, 1, 2, 3, 3, 3, 4, 4, 4, 0, 5, 2, 6}; vertex_t h_dst[] = {3, 3, 3, 4, 4, 4, 0, 1, 2, 0, 1, 2, 5, 0, 6, 2}; weight_t h_wgt[] = { 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 3.5, 4.0, 4.0}; - vertex_t h_first[] = 
{0, 0}; - vertex_t h_second[] = {1, 2}; + vertex_t h_first[] = {0, 0}; + vertex_t h_second[] = {1, 2}; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.990830, 0.976187}; + weight_t h_result[] = {0.990830, 0.976187}; return generic_similarity_test(handle, h_src, @@ -789,19 +803,21 @@ int test_weighted_cosine(const cugraph_resource_handle_t* handle) int test_all_pairs_cosine(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 0; - size_t topk = 0; + size_t topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, + 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, + 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; return generic_similarity_test(handle, h_src, @@ -823,19 +839,20 @@ int test_all_pairs_cosine(const cugraph_resource_handle_t* handle) int test_all_pairs_cosine_with_start_vertices(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 3; - size_t topk = 0; + size_t 
topk = 0; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; - vertex_t h_start_vertices[] = {0, 1, 2}; - weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; + vertex_t h_start_vertices[] = {0, 1, 2}; + weight_t h_result[] = { + 0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5}; return generic_similarity_test(handle, h_src, @@ -857,19 +874,19 @@ int test_all_pairs_cosine_with_start_vertices(const cugraph_resource_handle_t* h int test_all_pairs_cosine_with_topk(const cugraph_resource_handle_t* handle) { - size_t num_edges = 16; - size_t num_vertices = 6; - size_t num_pairs = 0; + size_t num_edges = 16; + size_t num_vertices = 6; + size_t num_pairs = 0; size_t num_start_vertices = 3; - size_t topk = 5; + size_t topk = 5; - vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; - vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; - weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; - vertex_t* h_first = NULL; - vertex_t* h_second = NULL; + vertex_t h_src[] = {0, 1, 1, 2, 2, 2, 3, 4, 1, 3, 4, 0, 1, 3, 5, 5}; + vertex_t h_dst[] = {1, 3, 4, 0, 1, 3, 5, 5, 0, 1, 1, 2, 2, 2, 3, 4}; + weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; + vertex_t* h_first = NULL; + vertex_t* h_second = NULL; vertex_t* h_start_vertices = NULL; - weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0}; + weight_t h_result[] = {1.0, 1.0, 1.0, 1.0, 1.0}; return 
generic_similarity_test(handle, h_src, @@ -889,31 +906,6 @@ int test_all_pairs_cosine_with_topk(const cugraph_resource_handle_t* handle) ALL_PAIRS_COSINE); } - - - - - - - - - - - - - - - - - - - - - - - - - /******************************************************************************/ int main(int argc, char** argv) diff --git a/cpp/tests/c_api/similarity_test.c b/cpp/tests/c_api/similarity_test.c index 0d544ff82d6..70e0cb6fb95 100644 --- a/cpp/tests/c_api/similarity_test.c +++ b/cpp/tests/c_api/similarity_test.c @@ -26,7 +26,7 @@ typedef int32_t vertex_t; typedef int32_t edge_t; typedef float weight_t; -typedef enum { JACCARD, SORENSEN, OVERLAP, COSINE} similarity_t; +typedef enum { JACCARD, SORENSEN, OVERLAP, COSINE } similarity_t; int generic_similarity_test(vertex_t* h_src, vertex_t* h_dst, @@ -706,7 +706,9 @@ int test_all_pairs_cosine() weight_t h_wgt[] = {0.1f, 2.1f, 1.1f, 5.1f, 3.1f, 4.1f, 7.2f, 3.2f}; vertex_t h_first[] = {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5}; vertex_t h_second[] = {1, 2, 3, 4, 0, 2, 3, 5, 0, 1, 3, 4, 5, 0, 1, 2, 4, 0, 2, 3, 1, 2}; - weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; + weight_t h_result[] = {0.5, 0.5, 1.0, 0.5, 0.5, 0.666667, 0.333333, 1.0, + 0.5, 0.666667, 0.333333, 0.5, 0.5, 1.0, 0.333333, 0.333333, + 1.0, 0.5, 0.5, 1.0, 1.0, 0.5}; return generic_all_pairs_similarity_test(h_src, h_dst, @@ -964,8 +966,6 @@ int test_all_pairs_cosine_topk() COSINE); } - - int test_weighted_all_pairs_cosine() { size_t num_edges = 16; @@ -1016,7 +1016,7 @@ int test_weighted_all_pairs_cosine() int main(int argc, char** argv) { int result = 0; - + result |= RUN_TEST(test_jaccard); result |= RUN_TEST(test_sorensen); result |= RUN_TEST(test_overlap); diff --git a/python/cugraph/cugraph/__init__.py b/python/cugraph/cugraph/__init__.py index 9cd8d32eb3c..ada1fec74cb 100644 --- 
a/python/cugraph/cugraph/__init__.py +++ b/python/cugraph/cugraph/__init__.py @@ -85,7 +85,7 @@ all_pairs_sorensen, cosine, cosine_coefficient, - all_pairs_cosine + all_pairs_cosine, ) from cugraph.traversal import ( diff --git a/python/cugraph/cugraph/dask/link_prediction/cosine.py b/python/cugraph/cugraph/dask/link_prediction/cosine.py index 4bd341d00fc..d10abbf9976 100644 --- a/python/cugraph/cugraph/dask/link_prediction/cosine.py +++ b/python/cugraph/cugraph/dask/link_prediction/cosine.py @@ -198,10 +198,11 @@ def cosine(input_graph, vertex_pair=None, use_weight=False): def all_pairs_cosine( - input_graph, - vertices: cudf.Series = None, - use_weight: bool = False, - topk: int = None): + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): """ Compute the All Pairs Cosine similarity between all pairs of vertices specified. All pairs Cosine similarity is defined between two sets as the ratio of the volume @@ -239,7 +240,7 @@ def all_pairs_cosine( Flag to indicate whether to compute weighted cosine (if use_weight==True) or un-weighted cosine (if use_weight==False). 'input_graph' must be weighted if 'use_weight=True'. 
- + topk : int, optional (default=None) Specify the number of answers to return otherwise returns the entire solution @@ -278,13 +279,11 @@ def all_pairs_cosine( ) if not isinstance(vertices, (dask_cudf.Series)): - vertices = dask_cudf.from_cudf( - vertices, npartitions=get_n_workers() - ) + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) if input_graph.renumbered: vertices = input_graph.lookup_internal_vertex_id(vertices) - + n_workers = get_n_workers() vertices = vertices.repartition(npartitions=n_workers) vertices = persist_dask_df_equal_parts_per_worker(vertices, client) diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index 85c2edab2cf..8c6c94a144f 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -196,10 +196,11 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): def all_pairs_jaccard( - input_graph, - vertices: cudf.Series = None, - use_weight: bool = False, - topk: int = None): + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): """ Compute the All Pairs Jaccard similarity between all pairs of vertices specified. All pairs Jaccard similarity is defined between two sets as the ratio of the volume @@ -237,7 +238,7 @@ def all_pairs_jaccard( Flag to indicate whether to compute weighted jaccard (if use_weight==True) or un-weighted jaccard (if use_weight==False). 'input_graph' must be weighted if 'use_weight=True'. 
- + topk : int, optional (default=None) Specify the number of answers to return otherwise returns the entire solution @@ -276,13 +277,11 @@ def all_pairs_jaccard( ) if not isinstance(vertices, (dask_cudf.Series)): - vertices = dask_cudf.from_cudf( - vertices, npartitions=get_n_workers() - ) + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) if input_graph.renumbered: vertices = input_graph.lookup_internal_vertex_id(vertices) - + n_workers = get_n_workers() vertices = vertices.repartition(npartitions=n_workers) vertices = persist_dask_df_equal_parts_per_worker(vertices, client) diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index 202d148937d..a27eb6eb98d 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -198,10 +198,11 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): def all_pairs_overlap( - input_graph, - vertices: cudf.Series = None, - use_weight: bool = False, - topk: int = None): + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): """ Compute the All Pairs Overlap similarity between all pairs of vertices specified. All pairs Overlap Coefficient is defined between two sets as the ratio of the volume @@ -239,7 +240,7 @@ def all_pairs_overlap( Flag to indicate whether to compute weighted overlap (if use_weight==True) or un-weighted overlap (if use_weight==False). 'input_graph' must be weighted if 'use_weight=True'. 
- + topk : int, optional (default=None) Specify the number of answers to return otherwise returns the entire solution @@ -278,13 +279,11 @@ def all_pairs_overlap( ) if not isinstance(vertices, (dask_cudf.Series)): - vertices = dask_cudf.from_cudf( - vertices, npartitions=get_n_workers() - ) + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) if input_graph.renumbered: vertices = input_graph.lookup_internal_vertex_id(vertices) - + n_workers = get_n_workers() vertices = vertices.repartition(npartitions=n_workers) vertices = persist_dask_df_equal_parts_per_worker(vertices, client) @@ -323,4 +322,4 @@ def all_pairs_overlap( ddf = input_graph.unrenumber(ddf, "first") ddf = input_graph.unrenumber(ddf, "second") - return ddf \ No newline at end of file + return ddf diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 728903327bb..529d1df1ef7 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -194,10 +194,11 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): def all_pairs_sorensen( - input_graph, - vertices: cudf.Series = None, - use_weight: bool = False, - topk: int = None): + input_graph, + vertices: cudf.Series = None, + use_weight: bool = False, + topk: int = None, +): """ Compute the All Pairs Sorensen similarity between all pairs of vertices specified. All pairs Sorensen coefficient is defined between two sets as the ratio of twice the @@ -235,7 +236,7 @@ def all_pairs_sorensen( Flag to indicate whether to compute weighted sorensen (if use_weight==True) or un-weighted sorensen (if use_weight==False). 'input_graph' must be weighted if 'use_weight=True'. 
- + topk : int, optional (default=None) Specify the number of answers to return otherwise returns the entire solution @@ -274,13 +275,11 @@ def all_pairs_sorensen( ) if not isinstance(vertices, (dask_cudf.Series)): - vertices = dask_cudf.from_cudf( - vertices, npartitions=get_n_workers() - ) + vertices = dask_cudf.from_cudf(vertices, npartitions=get_n_workers()) if input_graph.renumbered: vertices = input_graph.lookup_internal_vertex_id(vertices) - + n_workers = get_n_workers() vertices = vertices.repartition(npartitions=n_workers) vertices = persist_dask_df_equal_parts_per_worker(vertices, client) diff --git a/python/cugraph/cugraph/link_prediction/cosine.py b/python/cugraph/cugraph/link_prediction/cosine.py index 297fa15d336..9c7cdd2cfbf 100644 --- a/python/cugraph/cugraph/link_prediction/cosine.py +++ b/python/cugraph/cugraph/link_prediction/cosine.py @@ -235,11 +235,12 @@ def cosine_coefficient( return df + def all_pairs_cosine( input_graph: Graph, vertices: cudf.Series = None, use_weight: bool = False, - topk: int = None + topk: int = None, ): """ Compute the All Pairs Cosine similarity between all pairs of vertices specified. @@ -274,7 +275,7 @@ def all_pairs_cosine( Flag to indicate whether to compute weighted cosine (if use_weight==True) or un-weighted cosine (if use_weight==False). 'input_graph' must be weighted if 'use_weight=True'. 
- + topk : int, optional (default=None) Specify the number of answers to return otherwise returns the entire solution @@ -314,8 +315,9 @@ def all_pairs_cosine( if isinstance(vertices, list): vertices = cudf.Series( - vertices, dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype - ) + vertices, + dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype, + ) if input_graph.renumbered is True: if isinstance(vertices, cudf.DataFrame): @@ -325,8 +327,6 @@ def all_pairs_cosine( else: vertices = input_graph.lookup_internal_vertex_id(vertices) - - first, second, cosine_coeff = pylibcugraph_all_pairs_cosine_coefficients( resource_handle=ResourceHandle(), graph=input_graph._plc_graph, @@ -340,13 +340,8 @@ def all_pairs_cosine( vertex_pair["second"] = second if input_graph.renumbered: - vertex_pair = input_graph.unrenumber( - vertex_pair, "first", preserve_order=True - ) - vertex_pair = input_graph.unrenumber( - vertex_pair, "second", preserve_order=True - ) - + vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True) df = vertex_pair df["cosine_coeff"] = cudf.Series(cosine_coeff) diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 87585f76d10..5db28e5a33b 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -240,11 +240,12 @@ def jaccard_coefficient( return df + def all_pairs_jaccard( input_graph: Graph, vertices: cudf.Series = None, use_weight: bool = False, - topk: int = None + topk: int = None, ): """ Compute the All Pairs Jaccard similarity between all pairs of vertices specified. @@ -282,7 +283,7 @@ def all_pairs_jaccard( Flag to indicate whether to compute weighted jaccard (if use_weight==True) or un-weighted jaccard (if use_weight==False). 'input_graph' must be weighted if 'use_weight=True'. 
- + topk : int, optional (default=None) Specify the number of answers to return otherwise returns the entire solution @@ -322,8 +323,9 @@ def all_pairs_jaccard( if isinstance(vertices, list): vertices = cudf.Series( - vertices, dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype - ) + vertices, + dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype, + ) if input_graph.renumbered is True: if isinstance(vertices, cudf.DataFrame): @@ -333,8 +335,6 @@ def all_pairs_jaccard( else: vertices = input_graph.lookup_internal_vertex_id(vertices) - - first, second, jaccard_coeff = pylibcugraph_all_pairs_jaccard_coefficients( resource_handle=ResourceHandle(), graph=input_graph._plc_graph, @@ -348,13 +348,8 @@ def all_pairs_jaccard( vertex_pair["second"] = second if input_graph.renumbered: - vertex_pair = input_graph.unrenumber( - vertex_pair, "first", preserve_order=True - ) - vertex_pair = input_graph.unrenumber( - vertex_pair, "second", preserve_order=True - ) - + vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True) df = vertex_pair df["jaccard_coeff"] = cudf.Series(jaccard_coeff) diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index 48357900e16..5f6f74dba59 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -278,7 +278,7 @@ def all_pairs_overlap( input_graph: Graph, vertices: cudf.Series = None, use_weight: bool = False, - topk: int = None + topk: int = None, ): """ Compute the All Pairs Overlap Coefficient between each pair of vertices connected @@ -318,7 +318,7 @@ def all_pairs_overlap( Flag to indicate whether to compute weighted overlap (if use_weight==True) or un-weighted overlap (if use_weight==False). 'input_graph' must be weighted if 'use_weight=True'. 
- + topk : int, optional (default=None) Specify the number of answers to return otherwise returns the entire solution @@ -358,8 +358,9 @@ def all_pairs_overlap( if isinstance(vertices, list): vertices = cudf.Series( - vertices, dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype - ) + vertices, + dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype, + ) if input_graph.renumbered is True: if isinstance(vertices, cudf.DataFrame): @@ -369,8 +370,6 @@ def all_pairs_overlap( else: vertices = input_graph.lookup_internal_vertex_id(vertices) - - first, second, overlap_coeff = pylibcugraph_all_pairs_overlap_coefficients( resource_handle=ResourceHandle(), graph=input_graph._plc_graph, @@ -384,15 +383,10 @@ def all_pairs_overlap( vertex_pair["second"] = second if input_graph.renumbered: - vertex_pair = input_graph.unrenumber( - vertex_pair, "first", preserve_order=True - ) - vertex_pair = input_graph.unrenumber( - vertex_pair, "second", preserve_order=True - ) - + vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True) df = vertex_pair df["overlap_coeff"] = cudf.Series(overlap_coeff) - return df \ No newline at end of file + return df diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index 085e11398fe..584fe0dcbc9 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -277,7 +277,7 @@ def all_pairs_sorensen( input_graph: Graph, vertices: cudf.Series = None, use_weight: bool = False, - topk: int = None + topk: int = None, ): """ Compute All Pairs the Sorensen coefficient between each pair of vertices connected @@ -315,7 +315,7 @@ def all_pairs_sorensen( Flag to indicate whether to compute weighted sorensen (if use_weight==True) or un-weighted sorensen (if use_weight==False). 
'input_graph' must be weighted if 'use_weight=True'. - + topk : int, optional (default=None) Specify the number of answers to return otherwise returns the entire solution @@ -355,8 +355,9 @@ def all_pairs_sorensen( if isinstance(vertices, list): vertices = cudf.Series( - vertices, dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype - ) + vertices, + dtype=input_graph.edgelist.edgelist_df[input_graph.srcCol].dtype, + ) if input_graph.renumbered is True: if isinstance(vertices, cudf.DataFrame): @@ -366,8 +367,6 @@ def all_pairs_sorensen( else: vertices = input_graph.lookup_internal_vertex_id(vertices) - - first, second, sorensen_coeff = pylibcugraph_all_pairs_sorensen_coefficients( resource_handle=ResourceHandle(), graph=input_graph._plc_graph, @@ -381,13 +380,8 @@ def all_pairs_sorensen( vertex_pair["second"] = second if input_graph.renumbered: - vertex_pair = input_graph.unrenumber( - vertex_pair, "first", preserve_order=True - ) - vertex_pair = input_graph.unrenumber( - vertex_pair, "second", preserve_order=True - ) - + vertex_pair = input_graph.unrenumber(vertex_pair, "first", preserve_order=True) + vertex_pair = input_graph.unrenumber(vertex_pair, "second", preserve_order=True) df = vertex_pair df["sorensen_coeff"] = cudf.Series(sorensen_coeff) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py index 88d292dec76..f85508cb089 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_cosine_mg.py @@ -64,7 +64,17 @@ def input_combo(request): tests or other parameterized fixtures. 
""" parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "has_vertices", "has_topk", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters @@ -150,7 +160,7 @@ def input_expected_output_all_pairs(input_combo): else: vertices = None - + if has_topk: topk = 5 else: @@ -160,7 +170,10 @@ def input_expected_output_all_pairs(input_combo): print("vertices ", vertices, " is_weighted = ", is_weighted) input_combo["topk"] = topk sg_cugraph_all_pairs_cosine = cugraph.all_pairs_cosine( - G, vertices=input_combo["vertices"], topk=input_combo["topk"], use_weight=is_weighted + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for @@ -235,16 +248,20 @@ def test_dask_mg_cosine(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_mg_all_pairs_cosine(dask_client, benchmark, input_expected_output_all_pairs): +def test_dask_mg_all_pairs_cosine( + dask_client, benchmark, input_expected_output_all_pairs +): dg = input_expected_output_all_pairs["MGGraph"] - use_weight = input_expected_output_all_pairs["is_weighted"] - result_cosine = benchmark( - dcg.all_pairs_cosine, dg, vertices=input_expected_output_all_pairs["vertices"], topk=input_expected_output_all_pairs["topk"], use_weight=use_weight + dcg.all_pairs_cosine, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, ) result_cosine = ( diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py index 1f5e811f291..34ee72e799b 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py +++ 
b/python/cugraph/cugraph/tests/link_prediction/test_jaccard.py @@ -350,13 +350,20 @@ def test_all_pairs_jaccard(): # Call Jaccard jaccard_results = cugraph.jaccard(G) - + # Remove self loop - jaccard_results = jaccard_results[jaccard_results['first'] != jaccard_results['second']].reset_index(drop=True) - + jaccard_results = jaccard_results[ + jaccard_results["first"] != jaccard_results["second"] + ].reset_index(drop=True) + all_pairs_jaccard_results = cugraph.all_pairs_jaccard(G) - assert_frame_equal(jaccard_results.head(), all_pairs_jaccard_results.head(), check_dtype=False, check_like=True) + assert_frame_equal( + jaccard_results.head(), + all_pairs_jaccard_results.head(), + check_dtype=False, + check_like=True, + ) # FIXME @@ -368,23 +375,30 @@ def test_all_pairs_jaccard_with_vertices(): # Call Jaccard jaccard_results = cugraph.jaccard(G) - + # Remove self loop - jaccard_results = jaccard_results[jaccard_results['first'] != jaccard_results['second']].reset_index(drop=True) + jaccard_results = jaccard_results[ + jaccard_results["first"] != jaccard_results["second"] + ].reset_index(drop=True) vertices = [0, 1, 2] - mask_first = jaccard_results['first'].isin(vertices) - mask_second = jaccard_results['second'].isin(vertices) - # mask = [v in vertices for v in (jaccard_results['first'].to_pandas() or jaccard_results['second'].to_pandas())] + mask_first = jaccard_results["first"].isin(vertices) + mask_second = jaccard_results["second"].isin(vertices) + # mask = [v in vertices for v in (jaccard_results['first'].to_pandas() + # or jaccard_results['second'].to_pandas())] mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] jaccard_results = jaccard_results[mask].reset_index(drop=True) # Call all-pairs Jaccard - all_pairs_jaccard_results = cugraph.all_pairs_jaccard(G, vertices=cudf.Series(vertices, dtype="int32")) + all_pairs_jaccard_results = cugraph.all_pairs_jaccard( + G, vertices=cudf.Series(vertices, dtype="int32") + ) - 
assert_frame_equal(jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True) + assert_frame_equal( + jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True + ) @pytest.mark.sg @@ -396,12 +410,21 @@ def test_all_pairs_jaccard_with_topk(): jaccard_results = cugraph.jaccard(G) topk = 4 - + # Remove self loop - jaccard_results = jaccard_results[jaccard_results['first'] != jaccard_results['second']].\ - sort_values(["jaccard_coeff", "first", "second"], ascending=False).reset_index(drop=True)[:topk] + jaccard_results = ( + jaccard_results[jaccard_results["first"] != jaccard_results["second"]] + .sort_values(["jaccard_coeff", "first", "second"], ascending=False) + .reset_index(drop=True)[:topk] + ) # Call all-pairs Jaccard - all_pairs_jaccard_results = cugraph.all_pairs_jaccard(G, topk=topk).sort_values(["first", "second"], ascending=False).reset_index(drop=True) + all_pairs_jaccard_results = ( + cugraph.all_pairs_jaccard(G, topk=topk) + .sort_values(["first", "second"], ascending=False) + .reset_index(drop=True) + ) - assert_frame_equal(jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True) + assert_frame_equal( + jaccard_results, all_pairs_jaccard_results, check_dtype=False, check_like=True + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py index 063a9aa00b0..d907a0dfff2 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py @@ -64,7 +64,17 @@ def input_combo(request): tests or other parameterized fixtures. 
""" parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "has_vertices", "has_topk", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters @@ -150,7 +160,7 @@ def input_expected_output_all_pairs(input_combo): else: vertices = None - + if has_topk: topk = 5 else: @@ -160,7 +170,10 @@ def input_expected_output_all_pairs(input_combo): print("vertices ", vertices, " is_weighted = ", is_weighted) input_combo["topk"] = topk sg_cugraph_all_pairs_jaccard = cugraph.all_pairs_jaccard( - G, vertices=input_combo["vertices"], topk=input_combo["topk"], use_weight=is_weighted + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for @@ -235,16 +248,20 @@ def test_dask_mg_jaccard(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_mg_all_pairs_jaccard(dask_client, benchmark, input_expected_output_all_pairs): +def test_dask_mg_all_pairs_jaccard( + dask_client, benchmark, input_expected_output_all_pairs +): dg = input_expected_output_all_pairs["MGGraph"] - use_weight = input_expected_output_all_pairs["is_weighted"] - result_jaccard = benchmark( - dcg.all_pairs_jaccard, dg, vertices=input_expected_output_all_pairs["vertices"], topk=input_expected_output_all_pairs["topk"], use_weight=use_weight + dcg.all_pairs_jaccard, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, ) result_jaccard = ( diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index b864f0dbbdf..9999e994061 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ 
b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -251,13 +251,20 @@ def test_all_pairs_overlap(): # Call Overlap overlap_results = cugraph.overlap(G) - + # Remove self loop - overlap_results = overlap_results[overlap_results['first'] != overlap_results['second']].reset_index(drop=True) - + overlap_results = overlap_results[ + overlap_results["first"] != overlap_results["second"] + ].reset_index(drop=True) + all_pairs_overlap_results = cugraph.all_pairs_overlap(G) - assert_frame_equal(overlap_results.head(), all_pairs_overlap_results.head(), check_dtype=False, check_like=True) + assert_frame_equal( + overlap_results.head(), + all_pairs_overlap_results.head(), + check_dtype=False, + check_like=True, + ) # FIXME @@ -269,23 +276,30 @@ def test_all_pairs_overlap_with_vertices(): # Call Overlap overlap_results = cugraph.overlap(G) - + # Remove self loop - overlap_results = overlap_results[overlap_results['first'] != overlap_results['second']].reset_index(drop=True) + overlap_results = overlap_results[ + overlap_results["first"] != overlap_results["second"] + ].reset_index(drop=True) vertices = [0, 1, 2] - mask_first = overlap_results['first'].isin(vertices) - mask_second = overlap_results['second'].isin(vertices) - # mask = [v in vertices for v in (overlap_results['first'].to_pandas() or overlap_results['second'].to_pandas())] + mask_first = overlap_results["first"].isin(vertices) + mask_second = overlap_results["second"].isin(vertices) + # mask = [v in vertices for v in (overlap_results['first'].to_pandas() + # or overlap_results['second'].to_pandas())] mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] overlap_results = overlap_results[mask].reset_index(drop=True) # Call all-pairs Overlap - all_pairs_overlap_results = cugraph.all_pairs_overlap(G, vertices=cudf.Series(vertices, dtype="int32")) + all_pairs_overlap_results = cugraph.all_pairs_overlap( + G, vertices=cudf.Series(vertices, dtype="int32") + ) - 
assert_frame_equal(overlap_results, all_pairs_overlap_results, check_dtype=False, check_like=True) + assert_frame_equal( + overlap_results, all_pairs_overlap_results, check_dtype=False, check_like=True + ) @pytest.mark.sg @@ -297,12 +311,21 @@ def test_all_pairs_overlap_with_topk(): overlap_results = cugraph.overlap(G) topk = 4 - + # Remove self loop - overlap_results = overlap_results[overlap_results['first'] != overlap_results['second']].\ - sort_values(["overlap_coeff", "first", "second"], ascending=False).reset_index(drop=True)[:topk] + overlap_results = ( + overlap_results[overlap_results["first"] != overlap_results["second"]] + .sort_values(["overlap_coeff", "first", "second"], ascending=False) + .reset_index(drop=True)[:topk] + ) # Call all-pairs overlap - all_pairs_overlap_results = cugraph.all_pairs_overlap(G, topk=topk).sort_values(["first", "second"], ascending=False).reset_index(drop=True) + all_pairs_overlap_results = ( + cugraph.all_pairs_overlap(G, topk=topk) + .sort_values(["first", "second"], ascending=False) + .reset_index(drop=True) + ) - assert_frame_equal(overlap_results, all_pairs_overlap_results, check_dtype=False, check_like=True) + assert_frame_equal( + overlap_results, all_pairs_overlap_results, check_dtype=False, check_like=True + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index 77aabea868b..3793ceb4b93 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -150,7 +150,7 @@ def input_expected_output_all_pairs(input_combo): else: vertices = None - + if has_topk: topk = 5 else: @@ -159,7 +159,10 @@ def input_expected_output_all_pairs(input_combo): input_combo["vertices"] = vertices input_combo["topk"] = topk sg_cugraph_all_pairs_overlap = cugraph.all_pairs_overlap( - G, vertices=input_combo["vertices"], topk=input_combo["topk"], 
use_weight=is_weighted + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. Other tests using the input_combo fixture will look for @@ -237,16 +240,20 @@ def test_dask_mg_overlap(dask_client, benchmark, input_expected_output): @pytest.mark.mg -def test_dask_mg_all_pairs_overlap(dask_client, benchmark, input_expected_output_all_pairs): +def test_dask_mg_all_pairs_overlap( + dask_client, benchmark, input_expected_output_all_pairs +): dg = input_expected_output_all_pairs["MGGraph"] - use_weight = input_expected_output_all_pairs["is_weighted"] - result_overlap = benchmark( - dcg.all_pairs_overlap, dg, vertices=input_expected_output_all_pairs["vertices"], topk=input_expected_output_all_pairs["topk"], use_weight=use_weight + dcg.all_pairs_overlap, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, ) result_overlap = ( diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 0c10fe08b90..e7ac5202454 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -346,13 +346,20 @@ def test_all_pairs_sorensen(): # Call Sorensen sorensen_results = cugraph.sorensen(G) - + # Remove self loop - sorensen_results = sorensen_results[sorensen_results['first'] != sorensen_results['second']].reset_index(drop=True) - + sorensen_results = sorensen_results[ + sorensen_results["first"] != sorensen_results["second"] + ].reset_index(drop=True) + all_pairs_sorensen_results = cugraph.all_pairs_sorensen(G) - assert_frame_equal(sorensen_results.head(), all_pairs_sorensen_results.head(), check_dtype=False, check_like=True) + assert_frame_equal( + sorensen_results.head(), + all_pairs_sorensen_results.head(), + 
check_dtype=False, + check_like=True, + ) # FIXME @@ -364,23 +371,30 @@ def test_all_pairs_sorensen_with_vertices(): # Call Sorensen sorensen_results = cugraph.sorensen(G) - + # Remove self loop - sorensen_results = sorensen_results[sorensen_results['first'] != sorensen_results['second']].reset_index(drop=True) + sorensen_results = sorensen_results[ + sorensen_results["first"] != sorensen_results["second"] + ].reset_index(drop=True) vertices = [0, 1, 2] - mask_first = sorensen_results['first'].isin(vertices) - mask_second = sorensen_results['second'].isin(vertices) - # mask = [v in vertices for v in (sorensen_results['first'].to_pandas() or sorensen_results['second'].to_pandas())] + mask_first = sorensen_results["first"].isin(vertices) + mask_second = sorensen_results["second"].isin(vertices) + # mask = [v in vertices for v in (sorensen_results['first'].to_pandas() + # or sorensen_results['second'].to_pandas())] mask = [f or s for (f, s) in zip(mask_first.to_pandas(), mask_second.to_pandas())] sorensen_results = sorensen_results[mask].reset_index(drop=True) # Call all-pairs Sorensen - all_pairs_sorensen_results = cugraph.all_pairs_sorensen(G, vertices=cudf.Series(vertices, dtype="int32")) + all_pairs_sorensen_results = cugraph.all_pairs_sorensen( + G, vertices=cudf.Series(vertices, dtype="int32") + ) - assert_frame_equal(sorensen_results, all_pairs_sorensen_results, check_dtype=False, check_like=True) + assert_frame_equal( + sorensen_results, all_pairs_sorensen_results, check_dtype=False, check_like=True + ) @pytest.mark.sg @@ -392,12 +406,21 @@ def test_all_pairs_sorensen_with_topk(): sorensen_results = cugraph.sorensen(G) topk = 4 - + # Remove self loop - sorensen_results = sorensen_results[sorensen_results['first'] != sorensen_results['second']].\ - sort_values(["sorensen_coeff", "first", "second"], ascending=False).reset_index(drop=True)[:topk] + sorensen_results = ( + sorensen_results[sorensen_results["first"] != sorensen_results["second"]] + 
.sort_values(["sorensen_coeff", "first", "second"], ascending=False) + .reset_index(drop=True)[:topk] + ) # Call all-pairs sorensen - all_pairs_sorensen_results = cugraph.all_pairs_sorensen(G, topk=topk).sort_values(["first", "second"], ascending=False).reset_index(drop=True) + all_pairs_sorensen_results = ( + cugraph.all_pairs_sorensen(G, topk=topk) + .sort_values(["first", "second"], ascending=False) + .reset_index(drop=True) + ) - assert_frame_equal(sorensen_results, all_pairs_sorensen_results, check_dtype=False, check_like=True) + assert_frame_equal( + sorensen_results, all_pairs_sorensen_results, check_dtype=False, check_like=True + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py index ac39ed1cbc6..c4b4eae65d9 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py @@ -63,7 +63,17 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "has_vertices", "has_topk", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters @@ -149,7 +159,7 @@ def input_expected_output_all_pairs(input_combo): else: vertices = None - + if has_topk: topk = 5 else: @@ -159,7 +169,10 @@ def input_expected_output_all_pairs(input_combo): print("vertices ", vertices, " is_weighted = ", is_weighted) input_combo["topk"] = topk sg_cugraph_all_pairs_sorensen = cugraph.all_pairs_sorensen( - G, vertices=input_combo["vertices"], topk=input_combo["topk"], use_weight=is_weighted + G, + vertices=input_combo["vertices"], + topk=input_combo["topk"], + use_weight=is_weighted, ) # Save the results back to the input_combo dictionary to prevent redundant # cuGraph runs. 
Other tests using the input_combo fixture will look for diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd index e969afee76f..71d094a6058 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd @@ -53,7 +53,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_free( cugraph_similarity_result_t* result ) - + ########################################################################### # jaccard coefficients cdef cugraph_error_code_t \ @@ -66,7 +66,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) - + ########################################################################### # all-pairs jaccard coefficients cdef cugraph_error_code_t \ @@ -93,7 +93,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) - + ########################################################################### # all-pairs sorensen coefficients cdef cugraph_error_code_t \ @@ -120,7 +120,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) - + ########################################################################### # all-pairs overlap coefficients cdef cugraph_error_code_t \ @@ -134,7 +134,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) - + ########################################################################### # cosine coefficients cdef cugraph_error_code_t \ @@ -147,7 +147,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) - + ########################################################################### # 
all-pairs cosine coefficients cdef cugraph_error_code_t \ diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx index 0bf92b01614..b600dd48567 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_cosine_coefficients.pyx @@ -87,7 +87,7 @@ def all_pairs_cosine_coefficients(ResourceHandle resource_handle, If set to True, then compute weighted cosine_coefficients( the input graph must be weighted in that case). Otherwise, compute non-weighted cosine_coefficients - + topk : size_t Specify the number of answers to return otherwise will return all values. @@ -142,7 +142,7 @@ def all_pairs_cosine_coefficients(ResourceHandle resource_handle, cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ cugraph_similarity_result_get_vertex_pairs(result_ptr) - + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ cugraph_vertex_pairs_get_first(vertex_pairs_ptr) diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx index 70e9846bb75..b65905b6850 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx @@ -87,7 +87,7 @@ def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, If set to True, then compute weighted jaccard_coefficients( the input graph must be weighted in that case). Otherwise, compute non-weighted jaccard_coefficients - + topk : size_t Specify the number of answers to return otherwise will return all values. 
@@ -142,7 +142,7 @@ def all_pairs_jaccard_coefficients(ResourceHandle resource_handle, cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ cugraph_similarity_result_get_vertex_pairs(result_ptr) - + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ cugraph_vertex_pairs_get_first(vertex_pairs_ptr) diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx index 95fc99a7dd2..74f3bc06a94 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx @@ -87,7 +87,7 @@ def all_pairs_overlap_coefficients(ResourceHandle resource_handle, If set to True, then compute weighted overlap_coefficients( the input graph must be weighted in that case). Otherwise, compute non-weighted overlap_coefficients - + topk : size_t Specify the number of answers to return otherwise will return all values. @@ -142,7 +142,7 @@ def all_pairs_overlap_coefficients(ResourceHandle resource_handle, cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ cugraph_similarity_result_get_vertex_pairs(result_ptr) - + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ cugraph_vertex_pairs_get_first(vertex_pairs_ptr) diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx index c5762271776..5e3fc24a4b4 100644 --- a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx +++ b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx @@ -87,7 +87,7 @@ def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, If set to True, then compute weighted sorensen_coefficients( the input graph must be weighted in that case). Otherwise, compute non-weighted sorensen_coefficients - + topk : size_t Specify the number of answers to return otherwise will return all values. 
@@ -142,7 +142,7 @@ def all_pairs_sorensen_coefficients(ResourceHandle resource_handle, cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \ cugraph_similarity_result_get_vertex_pairs(result_ptr) - + cdef cugraph_type_erased_device_array_view_t* first_view_ptr = \ cugraph_vertex_pairs_get_first(vertex_pairs_ptr) From 89e4da1a930a772301a7768e5290214fd48a75b5 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 11:48:07 -0700 Subject: [PATCH 22/33] update tests for python SG similarity algos --- .../tests/link_prediction/test_overlap.py | 65 +++++++++++++++++-- .../tests/link_prediction/test_sorensen.py | 61 ++++++++++++++++- 2 files changed, 120 insertions(+), 6 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index 9999e994061..f4defb971f5 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -21,6 +21,7 @@ import cugraph from cugraph.testing import utils, UNDIRECTED_DATASETS from cudf.testing import assert_series_equal, assert_frame_equal +import pandas as pd SRC_COL = "0" DST_COL = "1" @@ -113,6 +114,49 @@ def cpu_call(M, first, second): result.append(overlap(first[i], second[i], M)) return result +def compare(src1, dst1, val1, src2, dst2, val2): + # + # We will do comparison computations by using dataframe + # merge functions (essentially doing fast joins). We + # start by making two data frames + # + df1 = cudf.DataFrame() + df1["src1"] = src1 + df1["dst1"] = dst1 + if val1 is not None: + df1["val1"] = val1 + + df2 = cudf.DataFrame() + df2["src2"] = src2 + df2["dst2"] = dst2 + if val2 is not None: + df2["val2"] = val2 + + # + # Check to see if all pairs in the original data frame + # still exist in the new data frame. 
If we join (merge) + # the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i]) + # then we should get exactly the same number of entries in + # the data frame if we did not lose any data. + # + join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"]) + + if len(df1) != len(join): + join2 = df1.merge( + df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"] + ) + pd.set_option("display.max_rows", 500) + print("df1 = \n", df1.sort_values(["src1", "dst1"])) + print("df2 = \n", df2.sort_values(["src2", "dst2"])) + print( + "join2 = \n", + join2.sort_values(["src1", "dst1"]) + .to_pandas() + .query("src2.isnull()", engine="python"), + ) + + assert len(df1) == len(join) + # ============================================================================= # Pytest Fixtures @@ -310,14 +354,15 @@ def test_all_pairs_overlap_with_topk(): # Call Overlap overlap_results = cugraph.overlap(G) - topk = 4 + topk = 10 # Remove self loop overlap_results = ( overlap_results[overlap_results["first"] != overlap_results["second"]] .sort_values(["overlap_coeff", "first", "second"], ascending=False) - .reset_index(drop=True)[:topk] + .reset_index(drop=True)#[:topk] ) + print("overlap_results = \n", overlap_results) # Call all-pairs overlap all_pairs_overlap_results = ( @@ -326,6 +371,18 @@ def test_all_pairs_overlap_with_topk(): .reset_index(drop=True) ) - assert_frame_equal( - overlap_results, all_pairs_overlap_results, check_dtype=False, check_like=True + # 1. All pair similarity might return different top pairs k pairs + # which are still valid hence, ensure the pairs returned by all-pairs + # exists. + + compare( + all_pairs_overlap_results["first"], + all_pairs_overlap_results["second"], + all_pairs_overlap_results["overlap_coeff"], + overlap_results["first"], + overlap_results["second"], + overlap_results["overlap_coeff"], ) + + # 2. 
Ensure the coefficient scores are still the highest + assert_series_equal(all_pairs_overlap_results["overlap_coeff"], overlap_results["overlap_coeff"][:topk]) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index e7ac5202454..7f5860a574d 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -15,6 +15,7 @@ import pytest import networkx as nx +import pandas as pd import cudf import cugraph @@ -156,6 +157,50 @@ def networkx_call(M, benchmark_callable=None): return src, dst, coeff +def compare(src1, dst1, val1, src2, dst2, val2): + # + # We will do comparison computations by using dataframe + # merge functions (essentially doing fast joins). We + # start by making two data frames + # + df1 = cudf.DataFrame() + df1["src1"] = src1 + df1["dst1"] = dst1 + if val1 is not None: + df1["val1"] = val1 + + df2 = cudf.DataFrame() + df2["src2"] = src2 + df2["dst2"] = dst2 + if val2 is not None: + df2["val2"] = val2 + + # + # Check to see if all pairs in the original data frame + # still exist in the new data frame. If we join (merge) + # the data frames where (src1[i]=src2[i]) and (dst1[i]=dst2[i]) + # then we should get exactly the same number of entries in + # the data frame if we did not lose any data. 
+ # + join = df1.merge(df2, left_on=["src1", "dst1"], right_on=["src2", "dst2"]) + + if len(df1) != len(join): + join2 = df1.merge( + df2, how="left", left_on=["src1", "dst1"], right_on=["src2", "dst2"] + ) + pd.set_option("display.max_rows", 500) + print("df1 = \n", df1.sort_values(["src1", "dst1"])) + print("df2 = \n", df2.sort_values(["src2", "dst2"])) + print( + "join2 = \n", + join2.sort_values(["src1", "dst1"]) + .to_pandas() + .query("src2.isnull()", engine="python"), + ) + + assert len(df1) == len(join) + + # ============================================================================= # Pytest Fixtures # ============================================================================= @@ -421,6 +466,18 @@ def test_all_pairs_sorensen_with_topk(): .reset_index(drop=True) ) - assert_frame_equal( - sorensen_results, all_pairs_sorensen_results, check_dtype=False, check_like=True + # 1. All pair similarity might return different top pairs k pairs + # which are still valid hence, ensure the pairs returned by all-pairs + # exists. + + compare( + all_pairs_sorensen_results["first"], + all_pairs_sorensen_results["second"], + all_pairs_sorensen_results["sorensen_coeff"], + sorensen_results["first"], + sorensen_results["second"], + sorensen_results["sorensen_coeff"], ) + + # 2. 
Ensure the coefficient scores are still the highest + assert_series_equal(all_pairs_sorensen_results["sorensen_coeff"], sorensen_results["sorensen_coeff"][:topk]) \ No newline at end of file From 40eb10e2a92e229ad70b80278f3e2ef2f84b4dab Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 11:50:21 -0700 Subject: [PATCH 23/33] fix style --- .../cugraph/cugraph/tests/link_prediction/test_overlap.py | 8 ++++++-- .../cugraph/tests/link_prediction/test_sorensen.py | 5 ++++- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py index f4defb971f5..f87fe06f691 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap.py @@ -114,6 +114,7 @@ def cpu_call(M, first, second): result.append(overlap(first[i], second[i], M)) return result + def compare(src1, dst1, val1, src2, dst2, val2): # # We will do comparison computations by using dataframe @@ -360,7 +361,7 @@ def test_all_pairs_overlap_with_topk(): overlap_results = ( overlap_results[overlap_results["first"] != overlap_results["second"]] .sort_values(["overlap_coeff", "first", "second"], ascending=False) - .reset_index(drop=True)#[:topk] + .reset_index(drop=True) # [:topk] ) print("overlap_results = \n", overlap_results) @@ -385,4 +386,7 @@ def test_all_pairs_overlap_with_topk(): ) # 2. 
Ensure the coefficient scores are still the highest - assert_series_equal(all_pairs_overlap_results["overlap_coeff"], overlap_results["overlap_coeff"][:topk]) + assert_series_equal( + all_pairs_overlap_results["overlap_coeff"], + overlap_results["overlap_coeff"][:topk], + ) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py index 7f5860a574d..4c30f149ea5 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen.py @@ -480,4 +480,7 @@ def test_all_pairs_sorensen_with_topk(): ) # 2. Ensure the coefficient scores are still the highest - assert_series_equal(all_pairs_sorensen_results["sorensen_coeff"], sorensen_results["sorensen_coeff"][:topk]) \ No newline at end of file + assert_series_equal( + all_pairs_sorensen_results["sorensen_coeff"], + sorensen_results["sorensen_coeff"][:topk], + ) From 17b285e056037e3b6b1fb0c13ad5b84319eb56fc Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 12:35:06 -0700 Subject: [PATCH 24/33] fix OOM issue --- .../tests/link_prediction/test_jaccard_mg.py | 15 +++++++++------ .../tests/link_prediction/test_overlap_mg.py | 16 ++++++++++------ 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py index d907a0dfff2..244718ce927 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_jaccard_mg.py @@ -152,6 +152,12 @@ def input_expected_output_all_pairs(input_combo): G = utils.generate_cugraph_graph_from_file( input_data_path, directed=directed, edgevals=is_weighted ) + + if has_topk: + topk = 5 + else: + topk = None + if has_vertices: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -160,15 +166,12 @@ def 
input_expected_output_all_pairs(input_combo): else: vertices = None - - if has_topk: - topk = 5 - else: - topk = None + # If no start_vertices are passed, all_pairs similarity runs OOM + topk = 10 input_combo["vertices"] = vertices - print("vertices ", vertices, " is_weighted = ", is_weighted) input_combo["topk"] = topk + print("vertices ", vertices) sg_cugraph_all_pairs_jaccard = cugraph.all_pairs_jaccard( G, vertices=input_combo["vertices"], diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index 3793ceb4b93..6bc426779f7 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -64,7 +64,7 @@ def input_combo(request): tests or other parameterized fixtures. """ parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "is_weighted"), request.param) + zip(("graph_file", "directed", "has_vertex_pair", "has_vertices", "has_topk", "is_weighted"), request.param) ) return parameters @@ -142,6 +142,12 @@ def input_expected_output_all_pairs(input_combo): G = utils.generate_cugraph_graph_from_file( input_data_path, directed=directed, edgevals=is_weighted ) + + if has_topk: + topk = 5 + else: + topk = None + if has_vertices: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -150,14 +156,12 @@ def input_expected_output_all_pairs(input_combo): else: vertices = None - - if has_topk: - topk = 5 - else: - topk = None + # If no start_vertices are passed, all_pairs similarity runs OOM + topk = 10 input_combo["vertices"] = vertices input_combo["topk"] = topk + print("vertices ", vertices) sg_cugraph_all_pairs_overlap = cugraph.all_pairs_overlap( G, vertices=input_combo["vertices"], From 6c182f43d9d661497dd9c5b6d6ab189a5802f7de Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 13:33:43 -0700 Subject: [PATCH 25/33] update tests --- 
.../tests/link_prediction/test_sorensen_mg.py | 60 +++++++++++++++++-- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py index c4b4eae65d9..e41daa64fb8 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_sorensen_mg.py @@ -52,6 +52,8 @@ def setup_function(): (datasets, "graph_file"), (IS_DIRECTED, "directed"), (HAS_VERTEX_PAIR, "has_vertex_pair"), + (HAS_VERTICES, "has_vertices"), + (HAS_TOPK, "has_topk"), (IS_WEIGHTED, "is_weighted"), ) @@ -151,6 +153,12 @@ def input_expected_output_all_pairs(input_combo): G = utils.generate_cugraph_graph_from_file( input_data_path, directed=directed, edgevals=is_weighted ) + + if has_topk: + topk = 5 + else: + topk = None + if has_vertices: # Sample random vertices from the graph and compute the two_hop_neighbors # with those seeds @@ -159,11 +167,8 @@ def input_expected_output_all_pairs(input_combo): else: vertices = None - - if has_topk: - topk = 5 - else: - topk = None + # If no start_vertices are passed, all_pairs similarity runs OOM + topk = 10 input_combo["vertices"] = vertices print("vertices ", vertices, " is_weighted = ", is_weighted) @@ -245,3 +250,48 @@ def test_dask_mg_sorensen(dask_client, benchmark, input_expected_output): assert len(sorensen_coeff_diffs1) == 0 assert len(sorensen_coeff_diffs2) == 0 + + +@pytest.mark.mg +def test_dask_mg_all_pairs_sorensen( + dask_client, benchmark, input_expected_output_all_pairs +): + + dg = input_expected_output_all_pairs["MGGraph"] + + use_weight = input_expected_output_all_pairs["is_weighted"] + + result_sorensen = benchmark( + dcg.all_pairs_sorensen, + dg, + vertices=input_expected_output_all_pairs["vertices"], + topk=input_expected_output_all_pairs["topk"], + use_weight=use_weight, + ) + + result_sorensen = ( + result_sorensen.compute() + 
.sort_values(["first", "second"]) + .reset_index(drop=True) + .rename(columns={"sorensen_coeff": "mg_cugraph_sorensen_coeff"}) + ) + + expected_output = ( + input_expected_output_all_pairs["sg_cugraph_results"] + .sort_values(["first", "second"]) + .reset_index(drop=True) + ) + + # Update the dask cugraph sorensen results with sg cugraph results for easy + # comparison using cuDF DataFrame methods. + result_sorensen["sg_cugraph_sorensen_coeff"] = expected_output["sorensen_coeff"] + + sorensen_coeff_diffs1 = result_sorensen.query( + "mg_cugraph_sorensen_coeff - sg_cugraph_sorensen_coeff > 0.00001" + ) + sorensen_coeff_diffs2 = result_sorensen.query( + "mg_cugraph_sorensen_coeff - sg_cugraph_sorensen_coeff < -0.00001" + ) + + assert len(sorensen_coeff_diffs1) == 0 + assert len(sorensen_coeff_diffs2) == 0 From ba220b6a59b541f6adef9a077b4cf0abf0a99f18 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 8 Jul 2024 13:37:26 -0700 Subject: [PATCH 26/33] fix style --- .../cugraph/tests/link_prediction/test_overlap_mg.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py index 6bc426779f7..aa238f6a6de 100644 --- a/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py +++ b/python/cugraph/cugraph/tests/link_prediction/test_overlap_mg.py @@ -64,7 +64,17 @@ def input_combo(request): tests or other parameterized fixtures. 
""" parameters = dict( - zip(("graph_file", "directed", "has_vertex_pair", "has_vertices", "has_topk", "is_weighted"), request.param) + zip( + ( + "graph_file", + "directed", + "has_vertex_pair", + "has_vertices", + "has_topk", + "is_weighted", + ), + request.param, + ) ) return parameters From 060fbeed2db859f70e195ecc7a8c4754078cc099 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Wed, 10 Jul 2024 06:16:51 -0700 Subject: [PATCH 27/33] update copyright --- python/pylibcugraph/pylibcugraph/utils.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pylibcugraph/pylibcugraph/utils.pxd b/python/pylibcugraph/pylibcugraph/utils.pxd index d4af1e795ae..21ab49a1f1e 100644 --- a/python/pylibcugraph/pylibcugraph/utils.pxd +++ b/python/pylibcugraph/pylibcugraph/utils.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2022, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at From 338524337e2661ff5525558a032973965fa186f9 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 25 Jul 2024 08:14:23 -0700 Subject: [PATCH 28/33] update copyright --- python/cugraph/cugraph/link_prediction/cosine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/link_prediction/cosine.py b/python/cugraph/cugraph/link_prediction/cosine.py index 9c7cdd2cfbf..b8cc419d8a6 100644 --- a/python/cugraph/cugraph/link_prediction/cosine.py +++ b/python/cugraph/cugraph/link_prediction/cosine.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 22024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at From fa1c8589c811afdabce7c7dcd4d3ee1f728b41f4 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 25 Jul 2024 08:18:27 -0700 Subject: [PATCH 29/33] update docstrings to indicate that weight support --- python/cugraph/cugraph/dask/link_prediction/cosine.py | 2 +- python/cugraph/cugraph/dask/link_prediction/jaccard.py | 2 +- python/cugraph/cugraph/dask/link_prediction/overlap.py | 2 +- python/cugraph/cugraph/dask/link_prediction/sorensen.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/dask/link_prediction/cosine.py b/python/cugraph/cugraph/dask/link_prediction/cosine.py index d10abbf9976..d9dcb8100d0 100644 --- a/python/cugraph/cugraph/dask/link_prediction/cosine.py +++ b/python/cugraph/cugraph/dask/link_prediction/cosine.py @@ -106,7 +106,7 @@ def cosine(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index 8c6c94a144f..5f520961efd 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -104,7 +104,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. 
vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index a27eb6eb98d..e9c08fd6096 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -106,7 +106,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 529d1df1ef7..43a55c777fd 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -102,7 +102,7 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): directed edge in both direction. The adjacency list will be computed if not already present. - This implementation only supports undirected, unweighted Graph. + This implementation only supports undirected, non-multi Graphs. 
vertex_pair : cudf.DataFrame, optional (default=None) A GPU dataframe consisting of two columns representing pairs of From f7647e728ec31f554a0b2284b57fdc78e2ec217d Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 25 Jul 2024 08:20:37 -0700 Subject: [PATCH 30/33] fix typo --- python/cugraph/cugraph/dask/link_prediction/cosine.py | 4 ++-- python/cugraph/cugraph/dask/link_prediction/jaccard.py | 4 ++-- python/cugraph/cugraph/dask/link_prediction/overlap.py | 4 ++-- python/cugraph/cugraph/dask/link_prediction/sorensen.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/dask/link_prediction/cosine.py b/python/cugraph/cugraph/dask/link_prediction/cosine.py index d9dcb8100d0..b1941a90986 100644 --- a/python/cugraph/cugraph/dask/link_prediction/cosine.py +++ b/python/cugraph/cugraph/dask/link_prediction/cosine.py @@ -123,7 +123,7 @@ def cosine(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair (will be identical to first if specified). @@ -248,7 +248,7 @@ def all_pairs_cosine( Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair (will be identical to first if specified). 
diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index 5f520961efd..fada91b4789 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -121,7 +121,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair (will be identical to first if specified). @@ -246,7 +246,7 @@ def all_pairs_jaccard( Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair (will be identical to first if specified). diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index e9c08fd6096..17c476d5357 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -123,7 +123,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair(will be identical to first if specified). @@ -248,7 +248,7 @@ def all_pairs_overlap( Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair (will be identical to first if specified). 
diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 43a55c777fd..9f9bfd46965 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -119,7 +119,7 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair(will be identical to first if specified). @@ -244,7 +244,7 @@ def all_pairs_sorensen( Returns ------- result : dask_cudf.DataFrame - GPU distributed data frame containing 2 dask_cudf.Series + GPU distributed data frame containing 3 dask_cudf.Series ddf['first']: dask_cudf.Series The first vertex ID of each pair (will be identical to first if specified). From 57f4c4b528e73406fc0f8cc8517d4355ee4fbe88 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Thu, 25 Jul 2024 08:45:38 -0700 Subject: [PATCH 31/33] update docstrings description --- .../cugraph/cugraph/dask/link_prediction/cosine.py | 14 +++++++------- .../cugraph/dask/link_prediction/jaccard.py | 4 ++-- .../cugraph/dask/link_prediction/overlap.py | 4 ++-- .../cugraph/dask/link_prediction/sorensen.py | 4 ++-- python/cugraph/cugraph/link_prediction/cosine.py | 10 ++++++++++ python/cugraph/cugraph/link_prediction/jaccard.py | 4 ++-- python/cugraph/cugraph/link_prediction/overlap.py | 4 ++-- python/cugraph/cugraph/link_prediction/sorensen.py | 6 +++--- 8 files changed, 30 insertions(+), 20 deletions(-) diff --git a/python/cugraph/cugraph/dask/link_prediction/cosine.py b/python/cugraph/cugraph/dask/link_prediction/cosine.py index b1941a90986..cac80738de0 100644 --- a/python/cugraph/cugraph/dask/link_prediction/cosine.py +++ b/python/cugraph/cugraph/dask/link_prediction/cosine.py @@ -85,10 +85,10 @@ def cosine(input_graph, 
vertex_pair=None, use_weight=False): """ Compute the Cosine similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. - Cosine similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context - of graphs, the neighborhood of a vertex is seen as a set. The Cosine - similarity weight of each edge represents the strength of connection + Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. + The Cosine similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. cugraph.dask.cosine, in the absence of a specified vertex pair list, will @@ -205,9 +205,9 @@ def all_pairs_cosine( ): """ Compute the All Pairs Cosine similarity between all pairs of vertices specified. - All pairs Cosine similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context - of graphs, the neighborhood of a vertex is seen as a set. The Cosine + All pairs Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. The Cosine similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors.
diff --git a/python/cugraph/cugraph/dask/link_prediction/jaccard.py b/python/cugraph/cugraph/dask/link_prediction/jaccard.py index fada91b4789..f72122048f9 100644 --- a/python/cugraph/cugraph/dask/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/dask/link_prediction/jaccard.py @@ -84,7 +84,7 @@ def jaccard(input_graph, vertex_pair=None, use_weight=False): Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context + of their intersection over the volume of their union. In the context of graphs, the neighborhood of a vertex is seen as a set. The Jaccard similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. @@ -204,7 +204,7 @@ def all_pairs_jaccard( """ Compute the All Pairs Jaccard similarity between all pairs of vertices specified. All pairs Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context + of their intersection over the volume of their union. In the context of graphs, the neighborhood of a vertex is seen as a set. The Jaccard similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. 
diff --git a/python/cugraph/cugraph/dask/link_prediction/overlap.py b/python/cugraph/cugraph/dask/link_prediction/overlap.py index 17c476d5357..e1a3285ee60 100644 --- a/python/cugraph/cugraph/dask/link_prediction/overlap.py +++ b/python/cugraph/cugraph/dask/link_prediction/overlap.py @@ -83,7 +83,7 @@ def overlap(input_graph, vertex_pair=None, use_weight=False): Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the + of their intersection over the smaller of their two volumes. In the context of graphs, the neighborhood of a vertex is seen as a set. The Overlap Coefficient weight of each edge represents the strength of connection between vertices based on the relative similarity of their @@ -206,7 +206,7 @@ def all_pairs_overlap( """ Compute the All Pairs Overlap similarity between all pairs of vertices specified. All pairs Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the context + of their intersection over the smaller of their two volumes. In the context of graphs, the neighborhood of a vertex is seen as a set. The Overlap similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. 
diff --git a/python/cugraph/cugraph/dask/link_prediction/sorensen.py b/python/cugraph/cugraph/dask/link_prediction/sorensen.py index 9f9bfd46965..3697385e8f8 100644 --- a/python/cugraph/cugraph/dask/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/dask/link_prediction/sorensen.py @@ -83,7 +83,7 @@ def sorensen(input_graph, vertex_pair=None, use_weight=False): Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. + volume of their intersection over the volume of each set. If first is specified but second is not, or vice versa, an exception will be thrown. @@ -202,7 +202,7 @@ def all_pairs_sorensen( """ Compute the All Pairs Sorensen similarity between all pairs of vertices specified. All pairs Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. In the context + volume of their intersection over the volume of each set. In the context of graphs, the neighborhood of a vertex is seen as a set. The Sorensen similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. diff --git a/python/cugraph/cugraph/link_prediction/cosine.py b/python/cugraph/cugraph/link_prediction/cosine.py index b8cc419d8a6..6d5c5dcf188 100644 --- a/python/cugraph/cugraph/link_prediction/cosine.py +++ b/python/cugraph/cugraph/link_prediction/cosine.py @@ -65,6 +65,11 @@ def cosine( """ Compute the Cosine similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. + The Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. 
+ In the context of graphs, the neighborhood of a vertex is seen as a set. + The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. cugraph.cosine, in the absence of a specified vertex pair list, will compute the two_hop_neighbors of the entire graph to construct a vertex pair @@ -246,6 +251,11 @@ def all_pairs_cosine( Compute the All Pairs Cosine similarity between all pairs of vertices specified. The Cosine similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. + The All Pairs Cosine similarity is defined between two sets as the ratio of their + intersection's volume over the square root of their volume's product. + In the context of graphs, the neighborhood of a vertex is seen as a set. + The Cosine similarity weight of each edge represents the strength of connection + between vertices based on the relative similarity of their neighbors. cugraph.all_pairs_cosine, in the absence of specified vertices, will compute the two_hop_neighbors of the entire graph to construct a vertex pair diff --git a/python/cugraph/cugraph/link_prediction/jaccard.py b/python/cugraph/cugraph/link_prediction/jaccard.py index 5db28e5a33b..214d92a1be5 100644 --- a/python/cugraph/cugraph/link_prediction/jaccard.py +++ b/python/cugraph/cugraph/link_prediction/jaccard.py @@ -66,7 +66,7 @@ def jaccard( Compute the Jaccard similarity between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context + of their intersection over the volume of their union. In the context of graphs, the neighborhood of a vertex is seen as a set. 
The Jaccard similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. @@ -250,7 +250,7 @@ def all_pairs_jaccard( """ Compute the All Pairs Jaccard similarity between all pairs of vertices specified. All pairs Jaccard similarity is defined between two sets as the ratio of the volume - of their intersection divided by the volume of their union. In the context + of their intersection over the volume of their union. In the context of graphs, the neighborhood of a vertex is seen as a set. The Jaccard similarity weight of each edge represents the strength of connection between vertices based on the relative similarity of their neighbors. diff --git a/python/cugraph/cugraph/link_prediction/overlap.py b/python/cugraph/cugraph/link_prediction/overlap.py index 5f6f74dba59..52697d6b552 100644 --- a/python/cugraph/cugraph/link_prediction/overlap.py +++ b/python/cugraph/cugraph/link_prediction/overlap.py @@ -152,7 +152,7 @@ def overlap( Compute the Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. In the + of their intersection over the smaller of their two volumes. In the context of graphs, the neighborhood of a vertex is seen as a set. The Overlap Coefficient weight of each edge represents the strength of connection between vertices based on the relative similarity of their @@ -284,7 +284,7 @@ def all_pairs_overlap( Compute the All Pairs Overlap Coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Overlap Coefficient is defined between two sets as the ratio of the volume - of their intersection divided by the smaller of their two volumes. 
In the + of their intersection over the smaller of their two volumes. In the context of graphs, the neighborhood of a vertex is seen as a set. The Overlap Coefficient weight of each edge represents the strength of connection between vertices based on the relative similarity of their diff --git a/python/cugraph/cugraph/link_prediction/sorensen.py b/python/cugraph/cugraph/link_prediction/sorensen.py index 584fe0dcbc9..8030234993b 100644 --- a/python/cugraph/cugraph/link_prediction/sorensen.py +++ b/python/cugraph/cugraph/link_prediction/sorensen.py @@ -67,7 +67,7 @@ def sorensen( Compute the Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. + volume of their intersection over the volume of each set. If first is specified but second is not, or vice versa, an exception will be thrown. @@ -280,10 +280,10 @@ def all_pairs_sorensen( topk: int = None, ): """ - Compute All Pairs the Sorensen coefficient between each pair of vertices connected + Compute the All Pairs Sorensen coefficient between each pair of vertices connected by an edge, or between arbitrary pairs of vertices specified by the user. Sorensen coefficient is defined between two sets as the ratio of twice the - volume of their intersection divided by the volume of each set. + volume of their intersection over the volume of each set. If first is specified but second is not, or vice versa, an exception will be thrown. 
From 4a843f841db14708460c5aa95b376583338a5ddb Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Fri, 26 Jul 2024 09:24:33 -0700 Subject: [PATCH 32/33] remove debug statement --- python/cugraph/cugraph/dask/link_prediction/cosine.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/cugraph/cugraph/dask/link_prediction/cosine.py b/python/cugraph/cugraph/dask/link_prediction/cosine.py index cac80738de0..e4007ad96d5 100644 --- a/python/cugraph/cugraph/dask/link_prediction/cosine.py +++ b/python/cugraph/cugraph/dask/link_prediction/cosine.py @@ -51,8 +51,6 @@ def convert_to_cudf(cp_arrays): def _call_plc_all_pairs_cosine( sID, mg_graph_x, vertices, use_weight, topk, do_expensive_check ): - print("vertices = ", vertices) - print("topk = ", topk) return pylibcugraph_all_pairs_cosine_coefficients( resource_handle=ResourceHandle(Comms.get_handle(sID).getHandle()), From 756d32b15476ba63db90e4ef7a713a9ac73e650c Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Sun, 28 Jul 2024 22:00:38 -0700 Subject: [PATCH 33/33] fix typo --- python/cugraph/cugraph/link_prediction/cosine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cugraph/cugraph/link_prediction/cosine.py b/python/cugraph/cugraph/link_prediction/cosine.py index 6d5c5dcf188..9dce0e96f8c 100644 --- a/python/cugraph/cugraph/link_prediction/cosine.py +++ b/python/cugraph/cugraph/link_prediction/cosine.py @@ -1,4 +1,4 @@ -# Copyright (c) 22024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at