From 5adf76f2a965d9b9350dc625e79a020f5b253aa5 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Wed, 18 Nov 2020 11:23:37 -0600 Subject: [PATCH 1/3] Initial version and test for cugraph.from_edgelist(). --- python/cugraph/__init__.py | 3 +- python/cugraph/structure/__init__.py | 3 +- python/cugraph/structure/convert_matrix.py | 55 +++++++++++++++++++- python/cugraph/tests/test_graph.py | 60 ++++++++-------------- 4 files changed, 80 insertions(+), 41 deletions(-) diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py index feab1cc3eb9..5848489e6fa 100644 --- a/python/cugraph/__init__.py +++ b/python/cugraph/__init__.py @@ -29,6 +29,7 @@ from cugraph.structure import ( Graph, DiGraph, + from_edgelist, from_cudf_edgelist, from_pandas_edgelist, to_pandas_edgelist, @@ -70,7 +71,7 @@ from cugraph.traversal import ( bfs, - bfs_edges, + bfs_edges, sssp, shortest_path, filter_unreachable, diff --git a/python/cugraph/structure/__init__.py b/python/cugraph/structure/__init__.py index b8b6fbe0435..f148c6dd9d7 100644 --- a/python/cugraph/structure/__init__.py +++ b/python/cugraph/structure/__init__.py @@ -14,7 +14,8 @@ from cugraph.structure.graph import Graph, DiGraph from cugraph.structure.number_map import NumberMap from cugraph.structure.symmetrize import symmetrize, symmetrize_df , symmetrize_ddf -from cugraph.structure.convert_matrix import (from_cudf_edgelist, +from cugraph.structure.convert_matrix import (from_edgelist, + from_cudf_edgelist, from_pandas_edgelist, to_pandas_edgelist, from_pandas_adjacency, diff --git a/python/cugraph/structure/convert_matrix.py b/python/cugraph/structure/convert_matrix.py index 8acdc7e1799..750af97ea01 100644 --- a/python/cugraph/structure/convert_matrix.py +++ b/python/cugraph/structure/convert_matrix.py @@ -15,8 +15,62 @@ # issue #146 is addressed, this file's extension should be changed from .pyx to # .py and should be located outside the python/cugraph/bindings directory. 
+import cudf + from cugraph.structure.graph import DiGraph, Graph +# optional dependencies used for handling different input types +try: + import pandas as pd +except ModuleNotFoundError: + pd = None + + +def from_edgelist(df, source='source', destination='destination', + edge_attr=None, create_using=Graph, renumber=True): + """ + Return a new graph created from the edge list representation. + + Parameters + ---------- + df : cudf.DataFrame, pandas.DataFrame + This DataFrame contains columns storing edge source vertices, + destination (or target following NetworkX's terminology) vertices, and + (optional) weights. + source : string or integer + This is used to index the source column. + destination : string or integer + This is used to index the destination (or target following NetworkX's + terminology) column. + edge_attr : string or integer, optional + This pointer can be ``None``. If not, this is used to index the weight + column. + create_using : cuGraph.Graph + Specify the type of Graph to create. Default is cugraph.Graph + renumber : bool + If source and destination indices are not in range 0 to V where V + is number of vertices, renumber argument should be True. 
+ + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G = cugraph.from_edgelist(M, source='0', target='1', weight='2') + """ + df_type = type(df) + + if df_type is cudf.DataFrame: + return from_cudf_edgelist(df, source, destination, + edge_attr, create_using, renumber) + + elif (pd is not None) and (df_type is pd.DataFrame): + return from_pandas_edgelist(df, source, destination, + edge_attr, create_using, renumber) + + else: + raise TypeError(f"obj of type {df_type} is not supported.") + def from_cudf_edgelist(df, source='source', destination='destination', edge_attr=None, create_using=Graph, renumber=True): @@ -52,7 +106,6 @@ def from_cudf_edgelist(df, source='source', destination='destination', >>> dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() >>> G = cugraph.from_cudf_edgelist(M, source='0', target='1', weight='2') - """ if create_using is Graph: G = Graph() diff --git a/python/cugraph/tests/test_graph.py b/python/cugraph/tests/test_graph.py index 59d0d5c4e09..2a6b2a37f2b 100644 --- a/python/cugraph/tests/test_graph.py +++ b/python/cugraph/tests/test_graph.py @@ -42,6 +42,13 @@ import networkx as nx +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + def compare_series(series_1, series_2): assert len(series_1) == len(series_2) df = cudf.DataFrame({"series_1": series_1, "series_2": series_2}) @@ -151,15 +158,12 @@ def check_all_two_hops(df, M): def test_version(): - gc.collect() cugraph.__version__ # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_add_edge_list_to_adj_list(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) M = utils.read_csv_for_nx(graph_file) @@ -180,8 
+184,6 @@ def test_add_edge_list_to_adj_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_add_adj_list_to_edge_list(graph_file): - gc.collect() - Mnx = utils.read_csv_for_nx(graph_file) N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 Mcsr = scipy.sparse.csr_matrix( @@ -208,8 +210,6 @@ def test_add_adj_list_to_edge_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_view_edge_list_from_adj_list(graph_file): - gc.collect() - Mnx = utils.read_csv_for_nx(graph_file) N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 Mcsr = scipy.sparse.csr_matrix( @@ -231,8 +231,6 @@ def test_view_edge_list_from_adj_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_delete_edge_list_delete_adj_list(graph_file): - gc.collect() - Mnx = utils.read_csv_for_nx(graph_file) df = cudf.DataFrame() df["src"] = cudf.Series(Mnx["0"]) @@ -261,8 +259,6 @@ def test_delete_edge_list_delete_adj_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_add_edge_or_adj_list_after_add_edge_or_adj_list(graph_file): - gc.collect() - Mnx = utils.read_csv_for_nx(graph_file) df = cudf.DataFrame() df["src"] = cudf.Series(Mnx["0"]) @@ -302,8 +298,6 @@ def test_add_edge_or_adj_list_after_add_edge_or_adj_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_edges_for_Graph(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) # Create nx Graph @@ -342,8 +336,6 @@ def test_edges_for_Graph(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_view_edge_list_for_Graph(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) # Create nx Graph @@ -387,8 +379,6 @@ def test_view_edge_list_for_Graph(graph_file): # Test @pytest.mark.parametrize('graph_file', utils.DATASETS) def test_consolidation(graph_file): - gc.collect() - cluster = LocalCUDACluster() client = Client(cluster) chunksize = 
dcg.get_chunksize(graph_file) @@ -423,8 +413,6 @@ def test_consolidation(graph_file): # Test @pytest.mark.parametrize('graph_file', utils.DATASETS_SMALL) def test_two_hop_neighbors(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) G = cugraph.DiGraph() @@ -444,8 +432,6 @@ def test_two_hop_neighbors(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_degree_functionality(graph_file): - gc.collect() - M = utils.read_csv_for_nx(graph_file) cu_M = utils.read_csv_file(graph_file) @@ -484,8 +470,6 @@ def test_degree_functionality(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_degrees_functionality(graph_file): - gc.collect() - M = utils.read_csv_for_nx(graph_file) cu_M = utils.read_csv_file(graph_file) @@ -517,8 +501,6 @@ def test_degrees_functionality(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_number_of_vertices(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) M = utils.read_csv_for_nx(graph_file) @@ -537,8 +519,6 @@ def test_number_of_vertices(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) def test_to_directed(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) cu_M = cu_M[cu_M["0"] <= cu_M["1"]].reset_index(drop=True) M = utils.read_csv_for_nx(graph_file) @@ -566,8 +546,6 @@ def test_to_directed(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) def test_to_undirected(graph_file): - gc.collect() - # Read data and then convert to directed by dropped some edges cu_M = utils.read_csv_file(graph_file) cu_M = cu_M[cu_M["0"] <= cu_M["1"]].reset_index(drop=True) @@ -602,8 +580,6 @@ def test_to_undirected(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_has_edge(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) cu_M = cu_M[cu_M["0"] <= cu_M["1"]].reset_index(drop=True) @@ -619,8 +595,6 @@ def 
test_has_edge(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_has_node(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique() @@ -632,13 +606,10 @@ def test_has_node(graph_file): assert G.has_node(n) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.parametrize('graph_file', utils.DATASETS) def test_bipartite_api(graph_file): # This test only tests the functionality of adding set of nodes and # retrieving them. The datasets currently used are not truly bipartite. - gc.collect() - cu_M = utils.read_csv_file(graph_file) nodes = cudf.concat([cu_M['0'], cu_M['1']]).unique() @@ -670,8 +641,6 @@ def test_bipartite_api(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_neighbors(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique() M = utils.read_csv_for_nx(graph_file) @@ -687,3 +656,18 @@ def test_neighbors(graph_file): cu_neighbors.sort() nx_neighbors.sort() assert cu_neighbors == nx_neighbors + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_from_edgelist(graph_file): + """ + Compare the resulting Graph objs from cugraph.from_edgelist() calls of both + a cudf and pandas DataFrame and ensure the results are equal. 
+ """ + cu_M = utils.read_csv_file(graph_file) + M = utils.read_csv_for_nx(graph_file) + + G1 = cugraph.from_edgelist(cu_M, source="0", destination="1") + G2 = cugraph.from_edgelist(M, source="0", destination="1") + + assert G1.EdgeList == G2.EdgeList From 11bd691110c0c97780d8ab474db43894b1e48908 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Thu, 19 Nov 2020 15:36:15 -0600 Subject: [PATCH 2/3] Fixed example in docstring, added PR 1274 to CHANGELOG.md --- CHANGELOG.md | 1 + python/cugraph/structure/convert_matrix.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d1802221c14..017ff151ab1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ ## New Features - PR #1245 Add functions to add pandas and numpy compatibility - PR #1260 Add katz_centrality mnmg wrapper +- PR #1274 Add generic from_edgelist() API ## Improvements - PR #1227 Pin cmake policies to cmake 3.17 version diff --git a/python/cugraph/structure/convert_matrix.py b/python/cugraph/structure/convert_matrix.py index 750af97ea01..614315bbc52 100644 --- a/python/cugraph/structure/convert_matrix.py +++ b/python/cugraph/structure/convert_matrix.py @@ -56,7 +56,7 @@ def from_edgelist(df, source='source', destination='destination', >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', >>> dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() - >>> G = cugraph.from_edgelist(M, source='0', target='1', weight='2') + >>> G = cugraph.from_edgelist(M, source='0', destination='1', edge_attr='2') """ df_type = type(df) From 5ea02102a8eed2b83f616600d1d69ed46c585b18 Mon Sep 17 00:00:00 2001 From: Rick Ratzel Date: Mon, 23 Nov 2020 20:06:10 -0600 Subject: [PATCH 3/3] Added support for dask_cudf DataFrame to from_edgelist(). 
--- python/cugraph/structure/convert_matrix.py | 12 ++++++- python/cugraph/tests/dask/test_mg_utility.py | 35 ++++++++++++++++++-- python/cugraph/tests/test_convert_matrix.py | 11 +++--- 3 files changed, 51 insertions(+), 7 deletions(-) diff --git a/python/cugraph/structure/convert_matrix.py b/python/cugraph/structure/convert_matrix.py index e8d89541653..edd1c630185 100644 --- a/python/cugraph/structure/convert_matrix.py +++ b/python/cugraph/structure/convert_matrix.py @@ -16,6 +16,7 @@ # .py and should be located outside the python/cugraph/bindings directory. import cudf +import dask_cudf from cugraph.structure.graph import DiGraph, Graph @@ -33,7 +34,7 @@ def from_edgelist(df, source='source', destination='destination', Parameters ---------- - df : cudf.DataFrame, pandas.DataFrame + df : cudf.DataFrame, pandas.DataFrame, dask_cudf.core.DataFrame This DataFrame contains columns storing edge source vertices, destination (or target following NetworkX's terminology) vertices, and (optional) weights. 
@@ -69,6 +70,15 @@ def from_edgelist(df, source='source', destination='destination', return from_pandas_edgelist(df, source, destination, edge_attr, create_using, renumber) + elif df_type is dask_cudf.core.DataFrame: + if create_using in [Graph, DiGraph]: + G = create_using() + else: + raise TypeError(f"'create_using' is type {create_using}, must be " + "either a cugraph.Graph or cugraph.DiGraph") + G.from_dask_cudf_edgelist(df, source, destination, edge_attr, renumber) + return G + else: raise TypeError(f"obj of type {df_type} is not supported.") diff --git a/python/cugraph/tests/dask/test_mg_utility.py b/python/cugraph/tests/dask/test_mg_utility.py index e802a65c37f..808f1bcfa70 100644 --- a/python/cugraph/tests/dask/test_mg_utility.py +++ b/python/cugraph/tests/dask/test_mg_utility.py @@ -28,6 +28,13 @@ from cugraph.tests import utils +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + @pytest.fixture def client_connection(): cluster = LocalCUDACluster() @@ -44,9 +51,33 @@ def client_connection(): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -def test_compute_local_data(client_connection): +def test_from_edgelist(client_connection): + input_data_path = r"../datasets/karate.csv" + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) - gc.collect() + dg1 = cugraph.from_edgelist( + ddf, source="src", destination="dst", edge_attr="value", + create_using=cugraph.DiGraph) + + dg2 = cugraph.DiGraph() + dg2.from_dask_cudf_edgelist( + ddf, source="src", destination="dst", edge_attr="value" + ) + + assert dg1.EdgeList == dg2.EdgeList + + +@pytest.mark.skipif( + 
is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +def test_compute_local_data(client_connection): input_data_path = r"../datasets/karate.csv" chunksize = dcg.get_chunksize(input_data_path) diff --git a/python/cugraph/tests/test_convert_matrix.py b/python/cugraph/tests/test_convert_matrix.py index f2dd59071a8..d418dd7ce2e 100644 --- a/python/cugraph/tests/test_convert_matrix.py +++ b/python/cugraph/tests/test_convert_matrix.py @@ -29,10 +29,15 @@ import networkx as nx -@pytest.mark.parametrize("graph_file", utils.DATASETS) -def test_to_from_pandas(graph_file): +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): gc.collect() + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_to_from_pandas(graph_file): # Read in the graph M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) @@ -79,8 +84,6 @@ def test_to_from_pandas(graph_file): @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_from_to_numpy(graph_file): - gc.collect() - # Read in the graph M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)