From 47b8debd0a17b1b00dd65fa7689d47d97f3e9438 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 24 Oct 2022 19:05:15 -0700 Subject: [PATCH 1/6] match the source to the edgelist dtype --- python/cugraph/cugraph/dask/traversal/bfs.py | 24 +++++++++++++------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/python/cugraph/cugraph/dask/traversal/bfs.py b/python/cugraph/cugraph/dask/traversal/bfs.py index bbf1a5faabb..5b6e83727fe 100644 --- a/python/cugraph/cugraph/dask/traversal/bfs.py +++ b/python/cugraph/cugraph/dask/traversal/bfs.py @@ -35,18 +35,20 @@ def convert_to_cudf(cp_arrays): return df -def _call_plc_bfs(sID, mg_graph_x, st_x, depth_limit=None, return_distances=True): +def _call_plc_bfs( + sID, mg_graph_x, st_x, depth_limit=None, direction_optimizing=False, + return_distances=True, do_expensive_check=False): + print("python: the start type in _call_plc_bfs is \n", st_x.dtype) return pylibcugraph_bfs( ResourceHandle(Comms.get_handle(sID).getHandle()), - mg_graph_x, - cudf.Series(st_x, dtype="int32"), - False, - depth_limit if depth_limit is not None else 0, - return_distances, - True, + graph=mg_graph_x, + sources=st_x, + direction_optimizing=direction_optimizing, + depth_limit=depth_limit if depth_limit is not None else 0, + compute_predecessors=return_distances, + do_expensive_check=do_expensive_check, ) - def bfs(input_graph, start, depth_limit=None, return_distances=True, check_start=True): """ Find the distances and predecessors for a breadth-first traversal of a @@ -156,6 +158,10 @@ def bfs(input_graph, start, depth_limit=None, return_distances=True, check_start start = input_graph.lookup_internal_vertex_id(start, tmp_col_names) data_start = get_distributed_data(start) + do_expensive_check = False + # FIXME: Why is 'direction_optimizing' not part of the python cugraph API + # and why is it set to 'False' by default + direction_optimizing = False cupy_result = [ client.submit( @@ -164,7 +170,9 @@ def bfs(input_graph, start, depth_limit=None, return_distances=True, check_start input_graph._plc_graph[w], st[0], depth_limit, + direction_optimizing, return_distances, + do_expensive_check, workers=[w], allow_other_workers=False, ) From fd6b788e218cc93d277531231691555875aa877c Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 24 Oct 2022 19:09:39 -0700 Subject: [PATCH 2/6] match the renumbered df dtype to the edgelist dtype --- .../cugraph/cugraph/structure/number_map.py | 14 ++++++++-- .../cugraph/tests/mg/test_mg_renumber.py | 28 +++++++++++++++++-- 2 files changed, 38 insertions(+), 4 deletions(-) diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 2ae5bdcc326..eea1036c932 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -258,7 +258,9 @@ def indirection_map(self, ddf, src_col_names, dst_col_names): # Set global index tmp_ddf = tmp_ddf.assign(idx=1) - tmp_ddf["global_id"] = tmp_ddf.idx.cumsum() - 1 + # ensure the original vertex and the 'global_id' columns are + # of the same type unless the original vertex type is 'string' + tmp_ddf["global_id"] = tmp_ddf.idx.cumsum().astype(self.id_type) - 1 tmp_ddf = tmp_ddf.drop(columns="idx") tmp_ddf = tmp_ddf.persist() self.ddf = tmp_ddf @@ -479,6 +481,11 @@ def renumber_and_segment( legacy_renum_only=False, ): renumbered = True + + # This assumes that the vertex columns are of the same type + # which should be the case + id_type = df.dtypes[0] + # FIXME: Drop the renumber_type 'experimental' once all the # algos follow the C/Pylibcugraph path @@ -490,6 +497,9 @@ def renumber_and_segment( df[src_col_names].dtype == np.int32 or df[src_col_names].dtype == np.int64 ): renumber_type = "legacy" + # If the vertices are of type 'string', set the renumber id_type + # to "int32" + id_type = "int32" else: # The renumber_type 'experimental' only runs the C++ # renumbering @@ -500,7 +510,7 @@ def renumber_and_segment( renumber_type = "skip_renumbering" renumbered = False - renumber_map = NumberMap() + renumber_map = NumberMap(id_type=id_type) if not isinstance(src_col_names, list): src_col_names = [src_col_names] dst_col_names = [dst_col_names] diff --git a/python/cugraph/cugraph/tests/mg/test_mg_renumber.py b/python/cugraph/cugraph/tests/mg/test_mg_renumber.py index 1340558732f..0d1d82dea61 100644 --- a/python/cugraph/cugraph/tests/mg/test_mg_renumber.py +++ b/python/cugraph/cugraph/tests/mg/test_mg_renumber.py @@ -308,7 +308,7 @@ def test_pagerank_string_vertex_ids(dask_client): G.from_cudf_edgelist(df, source="src", destination="dst") sg_results = cugraph.pagerank(G) - sg_results = sg_results.sort_values("pagerank").reset_index(drop=True) + sg_results = sg_results.sort_values("vertex").reset_index(drop=True) # MG ddf = dask_cudf.from_cudf(df, npartitions=2) @@ -318,7 +318,31 @@ def test_pagerank_string_vertex_ids(dask_client): mg_results = dcg.pagerank(G_dask) # Organize results for easy comparison, this does not change the values. MG # Pagerank defaults to float64, so convert to float32 when comparing to SG - mg_results = mg_results.compute().sort_values("pagerank").reset_index(drop=True) + mg_results = mg_results.compute().sort_values("vertex").reset_index(drop=True) mg_results["pagerank"] = mg_results["pagerank"].astype("float32") assert_frame_equal(sg_results, mg_results) + + +@pytest.mark.parametrize("dtype", ["int32", "int64"]) +def test_mg_renumber_multi_column(dtype, dask_client): + df = cudf.DataFrame( + {"src_a":[i for i in range(0,10)], "dst_a":[i for i in range(10, 20)]}).\ + astype(dtype) + + df["src_b"] = df["src_a"] + 10 + df["dst_b"] = df["dst_a"] + 20 + src_col = ["src_a", "src_b"] + dst_col = ["dst_a", "dst_b"] + + ddf = dask_cudf.from_cudf(df, npartitions=2) + edgelist_type = list(ddf.dtypes) + G = cugraph.Graph() + G.from_dask_cudf_edgelist(ddf, source=src_col, destination=dst_col) + renumbered_ddf = G.edgelist.edgelist_df + renumbered_edgelist_type = list(renumbered_ddf.dtypes) + + assert set(renumbered_edgelist_type).issubset(set(edgelist_type)) + + + From 0154980b6eef42367e7d2ec9de01bb106869d9f3 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 24 Oct 2022 19:12:16 -0700 Subject: [PATCH 3/6] fix style --- python/cugraph/cugraph/dask/traversal/bfs.py | 11 +++++++++-- python/cugraph/cugraph/tests/mg/test_mg_renumber.py | 7 ++----- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/python/cugraph/cugraph/dask/traversal/bfs.py b/python/cugraph/cugraph/dask/traversal/bfs.py index 5b6e83727fe..8ccfb52d096 100644 --- a/python/cugraph/cugraph/dask/traversal/bfs.py +++ b/python/cugraph/cugraph/dask/traversal/bfs.py @@ -36,8 +36,14 @@ def convert_to_cudf(cp_arrays): def _call_plc_bfs( - sID, mg_graph_x, st_x, depth_limit=None, direction_optimizing=False, - return_distances=True, do_expensive_check=False): + sID, + mg_graph_x, + st_x, + depth_limit=None, + direction_optimizing=False, + return_distances=True, + do_expensive_check=False, +): print("python: the start type in _call_plc_bfs is \n", st_x.dtype) return pylibcugraph_bfs( ResourceHandle(Comms.get_handle(sID).getHandle()), @@ -49,6 +55,7 @@ def _call_plc_bfs( do_expensive_check=do_expensive_check, ) + def bfs(input_graph, start, depth_limit=None, return_distances=True, check_start=True): """ Find the distances and predecessors for a breadth-first traversal of a diff --git a/python/cugraph/cugraph/tests/mg/test_mg_renumber.py b/python/cugraph/cugraph/tests/mg/test_mg_renumber.py index 0d1d82dea61..d273dacfd0c 100644 --- a/python/cugraph/cugraph/tests/mg/test_mg_renumber.py +++ b/python/cugraph/cugraph/tests/mg/test_mg_renumber.py @@ -327,8 +327,8 @@ def test_pagerank_string_vertex_ids(dask_client): @pytest.mark.parametrize("dtype", ["int32", "int64"]) def test_mg_renumber_multi_column(dtype, dask_client): df = cudf.DataFrame( - {"src_a":[i for i in range(0,10)], "dst_a":[i for i in range(10, 20)]}).\ - astype(dtype) + {"src_a": [i for i in range(0, 10)], "dst_a": [i for i in range(10, 20)]} + ).astype(dtype) df["src_b"] = df["src_a"] + 10 df["dst_b"] = df["dst_a"] + 20 @@ -343,6 +343,3 @@ def test_mg_renumber_multi_column(dtype, dask_client): renumbered_edgelist_type = list(renumbered_ddf.dtypes) assert set(renumbered_edgelist_type).issubset(set(edgelist_type)) - - - From 207b5e7ff3ce6565e1f5167ee416b599a61e388f Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 24 Oct 2022 19:23:42 -0700 Subject: [PATCH 4/6] update docstrings --- python/cugraph/cugraph/structure/number_map.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index eea1036c932..5921dfb2f71 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -497,8 +497,7 @@ def renumber_and_segment( df[src_col_names].dtype == np.int32 or df[src_col_names].dtype == np.int64 ): renumber_type = "legacy" - # If the vertices are of type 'string', set the renumber id_type - # to "int32" + # If the vertices are non-integers, set the renumber 'id_type' to "int32" id_type = "int32" else: # The renumber_type 'experimental' only runs the C++ From e334adff2ff08c0bd2959d1ab2b62bba614b8b24 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 24 Oct 2022 20:52:16 -0700 Subject: [PATCH 5/6] remove debug print --- python/cugraph/cugraph/dask/traversal/bfs.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cugraph/cugraph/dask/traversal/bfs.py b/python/cugraph/cugraph/dask/traversal/bfs.py index 8ccfb52d096..c6b9f8c8366 100644 --- a/python/cugraph/cugraph/dask/traversal/bfs.py +++ b/python/cugraph/cugraph/dask/traversal/bfs.py @@ -44,7 +44,6 @@ def _call_plc_bfs( return_distances=True, do_expensive_check=False, ): - print("python: the start type in _call_plc_bfs is \n", st_x.dtype) return pylibcugraph_bfs( ResourceHandle(Comms.get_handle(sID).getHandle()), graph=mg_graph_x, From c2ff6c06bb9828b16d58bcac7a52ee072ae17d8d Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Tue, 25 Oct 2022 15:52:08 -0700 Subject: [PATCH 6/6] update docstrings --- .../cugraph/cugraph/structure/number_map.py | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/python/cugraph/cugraph/structure/number_map.py b/python/cugraph/cugraph/structure/number_map.py index 5921dfb2f71..128e479a227 100644 --- a/python/cugraph/cugraph/structure/number_map.py +++ b/python/cugraph/cugraph/structure/number_map.py @@ -266,9 +266,10 @@ def indirection_map(self, ddf, src_col_names, dst_col_names): self.ddf = tmp_ddf return tmp_ddf - def __init__(self, id_type=np.int32, renumber_type=None): + def __init__(self, renumber_id_type=np.int32, unrenumbered_id_type=np.int32): self.implementation = None - self.id_type = id_type + self.renumber_id_type = renumber_id_type + self.unrenumbered_id_type = unrenumbered_id_type # The default src/dst column names in the resulting renumbered # dataframe. These may be updated by the renumbering methods if the # input dataframe uses the default names. @@ -482,9 +483,19 @@ def renumber_and_segment( ): renumbered = True - # This assumes that the vertex columns are of the same type - # which should be the case - id_type = df.dtypes[0] + # For columns with mismatch dtypes, set the renumbered + # id_type to either 'int32' or 'int64' + if df.dtypes.nunique() > 1: + # can't determine the edgelist input type + unrenumbered_id_type = None + else: + unrenumbered_id_type = df.dtypes[0] + + if np.int64 in list(df.dtypes): + renumber_id_type = np.int64 + else: + # renumber the edgelist to 'int32' + renumber_id_type = np.int32 # FIXME: Drop the renumber_type 'experimental' once all the # algos follow the C/Pylibcugraph path @@ -493,12 +504,11 @@ def renumber_and_segment( # C++ renumbering. if isinstance(src_col_names, list): renumber_type = "legacy" + elif not ( df[src_col_names].dtype == np.int32 or df[src_col_names].dtype == np.int64 ): renumber_type = "legacy" - # If the vertices are non-integers, set the renumber 'id_type' to "int32" - id_type = "int32" else: # The renumber_type 'experimental' only runs the C++ # renumbering @@ -509,7 +519,7 @@ def renumber_and_segment( renumber_type = "skip_renumbering" renumbered = False - renumber_map = NumberMap(id_type=id_type) + renumber_map = NumberMap(renumber_id_type, unrenumbered_id_type) if not isinstance(src_col_names, list): src_col_names = [src_col_names] dst_col_names = [dst_col_names] @@ -521,11 +531,19 @@ def renumber_and_segment( if isinstance(df, cudf.DataFrame): renumber_map.implementation = NumberMap.SingleGPU( - df, src_col_names, dst_col_names, renumber_map.id_type, store_transposed + df, + src_col_names, + dst_col_names, + renumber_map.renumber_id_type, + store_transposed, ) elif isinstance(df, dask_cudf.DataFrame): renumber_map.implementation = NumberMap.MultiGPU( - df, src_col_names, dst_col_names, renumber_map.id_type, store_transposed + df, + src_col_names, + dst_col_names, + renumber_map.renumber_id_type, + store_transposed, ) else: raise TypeError("df must be cudf.DataFrame or dask_cudf.DataFrame")