diff --git a/CHANGELOG.md b/CHANGELOG.md index ed5706245ea..d756b7dcec8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ - PR #1260 Add katz_centrality mnmg wrapper - PR #1264 CuPy sparse matrix input support for WCC, SCC, SSSP, and BFS - PR #1265 Implement Hungarian Algorithm +- PR #1274 Add generic from_edgelist() and from_adjlist() APIs ## Improvements - PR #1227 Pin cmake policies to cmake 3.17 version diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py index 18a50160f99..7cbab698cb7 100644 --- a/python/cugraph/__init__.py +++ b/python/cugraph/__init__.py @@ -29,6 +29,7 @@ from cugraph.structure import ( Graph, DiGraph, + from_edgelist, from_cudf_edgelist, from_pandas_edgelist, to_pandas_edgelist, @@ -38,6 +39,7 @@ to_numpy_array, from_numpy_matrix, to_numpy_matrix, + from_adjlist, hypergraph, symmetrize, symmetrize_df, @@ -70,7 +72,7 @@ from cugraph.traversal import ( bfs, - bfs_edges, + bfs_edges, sssp, shortest_path, filter_unreachable, diff --git a/python/cugraph/structure/__init__.py b/python/cugraph/structure/__init__.py index b8b6fbe0435..34447e80ee9 100644 --- a/python/cugraph/structure/__init__.py +++ b/python/cugraph/structure/__init__.py @@ -14,7 +14,8 @@ from cugraph.structure.graph import Graph, DiGraph from cugraph.structure.number_map import NumberMap from cugraph.structure.symmetrize import symmetrize, symmetrize_df , symmetrize_ddf -from cugraph.structure.convert_matrix import (from_cudf_edgelist, +from cugraph.structure.convert_matrix import (from_edgelist, + from_cudf_edgelist, from_pandas_edgelist, to_pandas_edgelist, from_pandas_adjacency, @@ -22,6 +23,7 @@ from_numpy_array, to_numpy_array, from_numpy_matrix, - to_numpy_matrix) + to_numpy_matrix, + from_adjlist) from cugraph.structure.hypergraph import hypergraph from cugraph.structure.shuffle import shuffle diff --git a/python/cugraph/structure/convert_matrix.py b/python/cugraph/structure/convert_matrix.py index 8acdc7e1799..edd1c630185 100644 --- 
a/python/cugraph/structure/convert_matrix.py +++ b/python/cugraph/structure/convert_matrix.py @@ -15,8 +15,133 @@ # issue #146 is addressed, this file's extension should be changed from .pyx to # .py and should be located outside the python/cugraph/bindings directory. +import cudf +import dask_cudf + from cugraph.structure.graph import DiGraph, Graph +# optional dependencies used for handling different input types +try: + import pandas as pd +except ModuleNotFoundError: + pd = None + + +def from_edgelist(df, source='source', destination='destination', + edge_attr=None, create_using=Graph, renumber=True): + """ + Return a new graph created from the edge list representation. + + Parameters + ---------- + df : cudf.DataFrame, pandas.DataFrame, dask_cudf.core.DataFrame + This DataFrame contains columns storing edge source vertices, + destination (or target following NetworkX's terminology) vertices, and + (optional) weights. + source : string or integer + This is used to index the source column. + destination : string or integer + This is used to index the destination (or target following NetworkX's + terminology) column. + edge_attr : string or integer, optional + This pointer can be ``None``. If not, this is used to index the weight + column. + create_using : cuGraph.Graph + Specify the type of Graph to create. Default is cugraph.Graph + renumber : bool + If source and destination indices are not in range 0 to V where V + is number of vertices, renumber argument should be True. 
+ + Examples + -------- + >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ', + >>> dtype=['int32', 'int32', 'float32'], header=None) + >>> G = cugraph.Graph() + >>> G = cugraph.from_edgelist(M, source='0', destination='1', + edge_attr='2') + """ + df_type = type(df) + + if df_type is cudf.DataFrame: + return from_cudf_edgelist(df, source, destination, + edge_attr, create_using, renumber) + + elif (pd is not None) and (df_type is pd.DataFrame): + return from_pandas_edgelist(df, source, destination, + edge_attr, create_using, renumber) + + elif df_type is dask_cudf.core.DataFrame: + if create_using in [Graph, DiGraph]: + G = create_using() + else: + raise TypeError(f"'create_using' is type {create_using}, must be " + "either a cugraph.Graph or cugraph.DiGraph") + G.from_dask_cudf_edgelist(df, source, destination, edge_attr, renumber) + return G + + else: + raise TypeError(f"obj of type {df_type} is not supported.") + + +def from_adjlist(offsets, indices, values=None, create_using=Graph): + """ + Initializes the graph from cuDF or Pandas Series representing adjacency + matrix CSR data and returns a new cugraph.Graph object if 'create_using' is + set to cugraph.Graph (the default), or cugraph.DiGraph if 'create_using' is + set to cugraph.DiGraph. + + Parameters + ---------- + offsets : cudf.Series, pandas.Series + The offsets of a CSR adjacency matrix. + indices : cudf.Series, pandas.Series + The indices of a CSR adjacency matrix. + values : cudf.Series, pandas.Series, or None (default), optional + The values in a CSR adjacency matrix, which represent edge weights in a + graph. If not provided, the resulting graph is considered unweighted. + create_using : cuGraph.Graph + Specify the type of Graph to create. Default is cugraph.Graph + + Examples + -------- + >>> pdf = pd.read_csv('datasets/karate.csv', delimiter=' ', + ... dtype={0:'int32', 1:'int32', 2:'float32'}, + ... 
header=None) + >>> M = scipy.sparse.coo_matrix((pdf[2],(pdf[0],pdf[1]))) + >>> M = M.tocsr() + >>> offsets = pd.Series(M.indptr) + >>> indices = pd.Series(M.indices) + >>> G = cugraph.from_adjlist(offsets, indices, None) + """ + offsets_type = type(offsets) + indices_type = type(indices) + if offsets_type != indices_type: + raise TypeError(f"'offsets' type {offsets_type} != 'indices' " + f"type {indices_type}") + if values is not None: + values_type = type(values) + if values_type != offsets_type: + raise TypeError(f"'values' type {values_type} != 'offsets' " + f"type {offsets_type}") + + if create_using in [Graph, DiGraph]: + G = create_using() + else: + raise TypeError(f"'create_using' is type {create_using}, must be " + "either a cugraph.Graph or cugraph.DiGraph") + + if offsets_type is cudf.Series: + G.from_cudf_adjlist(offsets, indices, values) + + elif (pd is not None) and (offsets_type is pd.Series): + G.from_cudf_adjlist(cudf.Series(offsets), cudf.Series(indices), + None if values is None else cudf.Series(values)) + + else: + raise TypeError(f"obj of type {offsets_type} is not supported.") + + return G + def from_cudf_edgelist(df, source='source', destination='destination', edge_attr=None, create_using=Graph, renumber=True): @@ -52,7 +177,6 @@ def from_cudf_edgelist(df, source='source', destination='destination', >>> dtype=['int32', 'int32', 'float32'], header=None) >>> G = cugraph.Graph() >>> G = cugraph.from_cudf_edgelist(M, source='0', target='1', weight='2') - """ if create_using is Graph: G = Graph() diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py index 9479960d8e6..fdeaff536ac 100644 --- a/python/cugraph/structure/graph.py +++ b/python/cugraph/structure/graph.py @@ -539,7 +539,6 @@ def to_numpy_matrix(self): """ Returns the graph adjacency matrix as a NumPy matrix. 
""" - np_array = self.to_numpy_array() return np.asmatrix(np_array) diff --git a/python/cugraph/tests/dask/test_mg_utility.py b/python/cugraph/tests/dask/test_mg_utility.py index e802a65c37f..808f1bcfa70 100644 --- a/python/cugraph/tests/dask/test_mg_utility.py +++ b/python/cugraph/tests/dask/test_mg_utility.py @@ -28,6 +28,13 @@ from cugraph.tests import utils +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + @pytest.fixture def client_connection(): cluster = LocalCUDACluster() @@ -44,9 +51,33 @@ def client_connection(): @pytest.mark.skipif( is_single_gpu(), reason="skipping MG testing on Single GPU system" ) -def test_compute_local_data(client_connection): +def test_from_edgelist(client_connection): + input_data_path = r"../datasets/karate.csv" + chunksize = dcg.get_chunksize(input_data_path) + ddf = dask_cudf.read_csv( + input_data_path, + chunksize=chunksize, + delimiter=" ", + names=["src", "dst", "value"], + dtype=["int32", "int32", "float32"], + ) - gc.collect() + dg1 = cugraph.from_edgelist( + ddf, source="src", destination="dst", edge_attr="value", + create_using=cugraph.DiGraph) + + dg2 = cugraph.DiGraph() + dg2.from_dask_cudf_edgelist( + ddf, source="src", destination="dst", edge_attr="value" + ) + + assert dg1.EdgeList == dg2.EdgeList + + +@pytest.mark.skipif( + is_single_gpu(), reason="skipping MG testing on Single GPU system" +) +def test_compute_local_data(client_connection): input_data_path = r"../datasets/karate.csv" chunksize = dcg.get_chunksize(input_data_path) diff --git a/python/cugraph/tests/test_convert_matrix.py b/python/cugraph/tests/test_convert_matrix.py index 29e64f700f4..d418dd7ce2e 100644 --- a/python/cugraph/tests/test_convert_matrix.py +++ b/python/cugraph/tests/test_convert_matrix.py @@ -29,10 +29,15 @@ import networkx as 
nx -@pytest.mark.parametrize("graph_file", utils.DATASETS) -def test_to_from_pandas(graph_file): +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): gc.collect() + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_to_from_pandas(graph_file): # Read in the graph M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) @@ -79,8 +84,6 @@ def test_to_from_pandas(graph_file): @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_from_to_numpy(graph_file): - gc.collect() - # Read in the graph M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) @@ -145,3 +148,49 @@ def test_from_to_numpy(graph_file): res_pdf = res_pdf[['src', 'dst', 'weights']] assert exp_pdf.equals(res_pdf) + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_from_edgelist(graph_file): + """ + Compare the resulting Graph objs from cugraph.from_edgelist() calls of both + a cudf and pandas DataFrame and ensure the results are equal. + """ + df = utils.read_csv_file(graph_file) + pdf = utils.read_csv_for_nx(graph_file) + + G1 = cugraph.from_edgelist(df, source="0", destination="1") + G2 = cugraph.from_edgelist(pdf, source="0", destination="1") + + assert G1.EdgeList == G2.EdgeList + + +@pytest.mark.parametrize("graph_file", utils.DATASETS) +def test_from_adjlist(graph_file): + """ + Compare the resulting Graph objs from cugraph.from_adjlist() calls of both + a cudf and pandas DataFrame and ensure the results are equal. + """ + G = utils.generate_cugraph_graph_from_file(graph_file, directed=True) + (cu_offsets, cu_indices, cu_vals) = G.view_adj_list() + + pd_offsets = cu_offsets.to_pandas() + pd_indices = cu_indices.to_pandas() + if cu_vals is not None: + pd_vals = cu_vals.to_pandas() + else: + pd_vals = None + + # FIXME: should mixing types be allowed? 
+ with pytest.raises(TypeError): + G1 = cugraph.from_adjlist(cu_offsets, pd_indices) + with pytest.raises(TypeError): + G1 = cugraph.from_adjlist(cu_offsets, cu_indices, cu_vals, + create_using=33) + + G1 = cugraph.from_adjlist(cu_offsets, cu_indices, cu_vals, + create_using=cugraph.DiGraph) + G2 = cugraph.from_adjlist(pd_offsets, pd_indices, pd_vals, + create_using=cugraph.DiGraph) + + assert G1.AdjList == G2.AdjList diff --git a/python/cugraph/tests/test_graph.py b/python/cugraph/tests/test_graph.py index 59d0d5c4e09..a912ecfa41a 100644 --- a/python/cugraph/tests/test_graph.py +++ b/python/cugraph/tests/test_graph.py @@ -42,6 +42,13 @@ import networkx as nx +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + def compare_series(series_1, series_2): assert len(series_1) == len(series_2) df = cudf.DataFrame({"series_1": series_1, "series_2": series_2}) @@ -151,15 +158,12 @@ def check_all_two_hops(df, M): def test_version(): - gc.collect() cugraph.__version__ # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_add_edge_list_to_adj_list(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) M = utils.read_csv_for_nx(graph_file) @@ -180,8 +184,6 @@ def test_add_edge_list_to_adj_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_add_adj_list_to_edge_list(graph_file): - gc.collect() - Mnx = utils.read_csv_for_nx(graph_file) N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 Mcsr = scipy.sparse.csr_matrix( @@ -208,8 +210,6 @@ def test_add_adj_list_to_edge_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_view_edge_list_from_adj_list(graph_file): - gc.collect() - Mnx = utils.read_csv_for_nx(graph_file) N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 Mcsr = 
scipy.sparse.csr_matrix( @@ -231,8 +231,6 @@ def test_view_edge_list_from_adj_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_delete_edge_list_delete_adj_list(graph_file): - gc.collect() - Mnx = utils.read_csv_for_nx(graph_file) df = cudf.DataFrame() df["src"] = cudf.Series(Mnx["0"]) @@ -261,8 +259,6 @@ def test_delete_edge_list_delete_adj_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_add_edge_or_adj_list_after_add_edge_or_adj_list(graph_file): - gc.collect() - Mnx = utils.read_csv_for_nx(graph_file) df = cudf.DataFrame() df["src"] = cudf.Series(Mnx["0"]) @@ -302,8 +298,6 @@ def test_add_edge_or_adj_list_after_add_edge_or_adj_list(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_edges_for_Graph(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) # Create nx Graph @@ -342,8 +336,6 @@ def test_edges_for_Graph(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_view_edge_list_for_Graph(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) # Create nx Graph @@ -387,8 +379,6 @@ def test_view_edge_list_for_Graph(graph_file): # Test @pytest.mark.parametrize('graph_file', utils.DATASETS) def test_consolidation(graph_file): - gc.collect() - cluster = LocalCUDACluster() client = Client(cluster) chunksize = dcg.get_chunksize(graph_file) @@ -423,8 +413,6 @@ def test_consolidation(graph_file): # Test @pytest.mark.parametrize('graph_file', utils.DATASETS_SMALL) def test_two_hop_neighbors(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) G = cugraph.DiGraph() @@ -444,8 +432,6 @@ def test_two_hop_neighbors(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_degree_functionality(graph_file): - gc.collect() - M = utils.read_csv_for_nx(graph_file) cu_M = utils.read_csv_file(graph_file) @@ -484,8 +470,6 @@ def test_degree_functionality(graph_file): # Test 
@pytest.mark.parametrize("graph_file", utils.DATASETS) def test_degrees_functionality(graph_file): - gc.collect() - M = utils.read_csv_for_nx(graph_file) cu_M = utils.read_csv_file(graph_file) @@ -517,8 +501,6 @@ def test_degrees_functionality(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_number_of_vertices(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) M = utils.read_csv_for_nx(graph_file) @@ -537,8 +519,6 @@ def test_number_of_vertices(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) def test_to_directed(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) cu_M = cu_M[cu_M["0"] <= cu_M["1"]].reset_index(drop=True) M = utils.read_csv_for_nx(graph_file) @@ -566,8 +546,6 @@ def test_to_directed(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) def test_to_undirected(graph_file): - gc.collect() - # Read data and then convert to directed by dropped some edges cu_M = utils.read_csv_file(graph_file) cu_M = cu_M[cu_M["0"] <= cu_M["1"]].reset_index(drop=True) @@ -602,8 +580,6 @@ def test_to_undirected(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_has_edge(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) cu_M = cu_M[cu_M["0"] <= cu_M["1"]].reset_index(drop=True) @@ -619,8 +595,6 @@ def test_has_edge(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_has_node(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique() @@ -632,13 +606,10 @@ def test_has_node(graph_file): assert G.has_node(n) -# Test all combinations of default/managed and pooled/non-pooled allocation @pytest.mark.parametrize('graph_file', utils.DATASETS) def test_bipartite_api(graph_file): # This test only tests the functionality of adding set of nodes and # retrieving them. 
The datasets currently used are not truly bipartite. - gc.collect() - cu_M = utils.read_csv_file(graph_file) nodes = cudf.concat([cu_M['0'], cu_M['1']]).unique() @@ -670,8 +641,6 @@ def test_bipartite_api(graph_file): # Test @pytest.mark.parametrize("graph_file", utils.DATASETS) def test_neighbors(graph_file): - gc.collect() - cu_M = utils.read_csv_file(graph_file) nodes = cudf.concat([cu_M["0"], cu_M["1"]]).unique() M = utils.read_csv_for_nx(graph_file)