Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] FEA generic from_edgelist() and from_adjlist() APIs #1274

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
- PR #1260 Add katz_centrality mnmg wrapper
- PR #1264 CuPy sparse matrix input support for WCC, SCC, SSSP, and BFS
- PR #1265 Implement Hungarian Algorithm
- PR #1274 Add generic from_edgelist() and from_adjlist() APIs

## Improvements
- PR #1227 Pin cmake policies to cmake 3.17 version
Expand Down
4 changes: 3 additions & 1 deletion python/cugraph/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
from cugraph.structure import (
Graph,
DiGraph,
from_edgelist,
from_cudf_edgelist,
from_pandas_edgelist,
to_pandas_edgelist,
Expand All @@ -38,6 +39,7 @@
to_numpy_array,
from_numpy_matrix,
to_numpy_matrix,
from_adjlist,
hypergraph,
symmetrize,
symmetrize_df,
Expand Down Expand Up @@ -70,7 +72,7 @@

from cugraph.traversal import (
bfs,
bfs_edges,
bfs_edges,
sssp,
shortest_path,
filter_unreachable,
Expand Down
6 changes: 4 additions & 2 deletions python/cugraph/structure/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,16 @@
from cugraph.structure.graph import Graph, DiGraph
from cugraph.structure.number_map import NumberMap
from cugraph.structure.symmetrize import symmetrize, symmetrize_df , symmetrize_ddf
from cugraph.structure.convert_matrix import (from_cudf_edgelist,
from cugraph.structure.convert_matrix import (from_edgelist,
from_cudf_edgelist,
from_pandas_edgelist,
to_pandas_edgelist,
from_pandas_adjacency,
to_pandas_adjacency,
from_numpy_array,
to_numpy_array,
from_numpy_matrix,
to_numpy_matrix)
to_numpy_matrix,
from_adjlist)
from cugraph.structure.hypergraph import hypergraph
from cugraph.structure.shuffle import shuffle
126 changes: 125 additions & 1 deletion python/cugraph/structure/convert_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,133 @@
# issue #146 is addressed, this file's extension should be changed from .pyx to
# .py and should be located outside the python/cugraph/bindings directory.

import cudf
import dask_cudf

from cugraph.structure.graph import DiGraph, Graph

# optional dependencies used for handling different input types
try:
import pandas as pd
except ModuleNotFoundError:
pd = None


def from_edgelist(df, source='source', destination='destination',
                  edge_attr=None, create_using=Graph, renumber=True):
    """
    Return a new graph created from the edge list representation.

    The call is dispatched on the type of ``df``: cudf DataFrames are handed
    to ``from_cudf_edgelist()``, pandas DataFrames to
    ``from_pandas_edgelist()``, and dask_cudf DataFrames are loaded through
    the ``from_dask_cudf_edgelist()`` method of a new ``create_using``
    instance.

    Parameters
    ----------
    df : cudf.DataFrame, pandas.DataFrame, dask_cudf.core.DataFrame
        This DataFrame contains columns storing edge source vertices,
        destination (or target following NetworkX's terminology) vertices, and
        (optional) weights.
    source : string or integer
        This is used to index the source column.
    destination : string or integer
        This is used to index the destination (or target following NetworkX's
        terminology) column.
    edge_attr : string or integer, optional
        May be ``None``. If not, this is used to index the weight column.
    create_using : cugraph.Graph or cugraph.DiGraph
        Specify the type of Graph to create. Default is cugraph.Graph
    renumber : bool
        If source and destination indices are not in range 0 to V where V
        is number of vertices, renumber argument should be True.

    Returns
    -------
    G : graph object
        For cudf and pandas input, whatever ``from_cudf_edgelist()`` or
        ``from_pandas_edgelist()`` returns. For dask_cudf input, a new
        instance of ``create_using``.

    Raises
    ------
    TypeError
        If ``df`` is not one of the supported DataFrame types, or if ``df``
        is a dask_cudf DataFrame and ``create_using`` is neither
        cugraph.Graph nor cugraph.DiGraph.

    Examples
    --------
    >>> M = cudf.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype=['int32', 'int32', 'float32'], header=None)
    >>> G = cugraph.from_edgelist(M, source='0', destination='1',
    ...                           edge_attr='2')
    """
    # isinstance() rather than an exact type() identity test, so that
    # subclasses of the supported DataFrame types are dispatched as well.
    if isinstance(df, cudf.DataFrame):
        return from_cudf_edgelist(df, source, destination,
                                  edge_attr, create_using, renumber)

    # pandas is an optional dependency; pd is None when it is not installed.
    if (pd is not None) and isinstance(df, pd.DataFrame):
        return from_pandas_edgelist(df, source, destination,
                                    edge_attr, create_using, renumber)

    if isinstance(df, dask_cudf.core.DataFrame):
        if create_using not in (Graph, DiGraph):
            raise TypeError(f"'create_using' is type {create_using}, must be "
                            "either a cugraph.Graph or cugraph.DiGraph")
        G = create_using()
        G.from_dask_cudf_edgelist(df, source, destination, edge_attr, renumber)
        return G

    raise TypeError(f"obj of type {type(df)} is not supported.")


def from_adjlist(offsets, indices, values=None, create_using=Graph):
    """
    Initializes the graph from cuDF or Pandas Series representing adjacency
    matrix CSR data and returns a new cugraph.Graph object if 'create_using' is
    set to cugraph.Graph (the default), or cugraph.DiGraph if 'create_using' is
    set to cugraph.DiGraph.

    Parameters
    ----------
    offsets : cudf.Series, pandas.Series
        The offsets of a CSR adjacency matrix.
    indices : cudf.Series, pandas.Series
        The indices of a CSR adjacency matrix. Must be the same type as
        ``offsets``.
    values : cudf.Series, pandas.Series, or None (default), optional
        The values in a CSR adjacency matrix, which represent edge weights in a
        graph. If not provided, the resulting graph is considered unweighted.
        When provided, must be the same type as ``offsets``.
    create_using : cugraph.Graph or cugraph.DiGraph
        Specify the type of Graph to create. Default is cugraph.Graph

    Returns
    -------
    G : cugraph.Graph or cugraph.DiGraph
        A new instance of ``create_using`` populated via
        ``from_cudf_adjlist()``.

    Raises
    ------
    TypeError
        If ``offsets``, ``indices`` and (when given) ``values`` are not all
        of the same type, if ``create_using`` is neither cugraph.Graph nor
        cugraph.DiGraph, or if the inputs are not cudf or pandas Series.

    Examples
    --------
    >>> pdf = pd.read_csv('datasets/karate.csv', delimiter=' ',
    ...                   dtype={0:'int32', 1:'int32', 2:'float32'},
    ...                   header=None)
    >>> M = scipy.sparse.coo_matrix((pdf[2],(pdf[0],pdf[1])))
    >>> M = M.tocsr()
    >>> offsets = pd.Series(M.indptr)
    >>> indices = pd.Series(M.indices)
    >>> G = cugraph.from_adjlist(offsets, indices, None)
    """
    # Mixing container types (e.g. cudf offsets with pandas indices) is
    # rejected up front, so only the type of 'offsets' drives the dispatch.
    offsets_type = type(offsets)
    indices_type = type(indices)
    if offsets_type != indices_type:
        raise TypeError(f"'offsets' type {offsets_type} != 'indices' "
                        f"type {indices_type}")
    if values is not None:
        values_type = type(values)
        if values_type != offsets_type:
            raise TypeError(f"'values' type {values_type} != 'offsets' "
                            f"type {offsets_type}")

    if create_using not in (Graph, DiGraph):
        raise TypeError(f"'create_using' is type {create_using}, must be "
                        "either a cugraph.Graph or cugraph.DiGraph")
    G = create_using()

    # isinstance() rather than an exact type() identity test, so that
    # subclasses of the supported Series types are accepted as well.
    if isinstance(offsets, cudf.Series):
        G.from_cudf_adjlist(offsets, indices, values)

    # pandas is an optional dependency; pd is None when it is not installed.
    elif (pd is not None) and isinstance(offsets, pd.Series):
        # Convert the pandas Series to cudf before loading them.
        G.from_cudf_adjlist(cudf.Series(offsets), cudf.Series(indices),
                            None if values is None else cudf.Series(values))

    else:
        raise TypeError(f"obj of type {offsets_type} is not supported.")

    return G


def from_cudf_edgelist(df, source='source', destination='destination',
edge_attr=None, create_using=Graph, renumber=True):
Expand Down Expand Up @@ -52,7 +177,6 @@ def from_cudf_edgelist(df, source='source', destination='destination',
>>> dtype=['int32', 'int32', 'float32'], header=None)
>>> G = cugraph.Graph()
>>> G = cugraph.from_cudf_edgelist(M, source='0', target='1', weight='2')

"""
if create_using is Graph:
G = Graph()
Expand Down
1 change: 0 additions & 1 deletion python/cugraph/structure/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -539,7 +539,6 @@ def to_numpy_matrix(self):
"""
Returns the graph adjacency matrix as a NumPy matrix.
"""

np_array = self.to_numpy_array()
return np.asmatrix(np_array)

Expand Down
35 changes: 33 additions & 2 deletions python/cugraph/tests/dask/test_mg_utility.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,13 @@
from cugraph.tests import utils


# =============================================================================
# Pytest Setup / Teardown - called for each test function
# =============================================================================
def setup_function():
    """Run a garbage-collection pass; pytest calls this per test function."""
    gc.collect()


@pytest.fixture
def client_connection():
cluster = LocalCUDACluster()
Expand All @@ -44,9 +51,33 @@ def client_connection():
@pytest.mark.skipif(
is_single_gpu(), reason="skipping MG testing on Single GPU system"
)
def test_compute_local_data(client_connection):
def test_from_edgelist(client_connection):
    """
    Build a DiGraph from a dask_cudf DataFrame through the generic
    cugraph.from_edgelist() API and through DiGraph.from_dask_cudf_edgelist()
    directly, then check that both report equal EdgeLists.
    """
    csv_path = r"../datasets/karate.csv"
    rows_per_chunk = dcg.get_chunksize(csv_path)
    ddf = dask_cudf.read_csv(
        csv_path,
        chunksize=rows_per_chunk,
        delimiter=" ",
        names=["src", "dst", "value"],
        dtype=["int32", "int32", "float32"],
    )

    gc.collect()

    # Column selection shared by both construction paths.
    columns = {"source": "src", "destination": "dst", "edge_attr": "value"}

    generic_graph = cugraph.from_edgelist(
        ddf, create_using=cugraph.DiGraph, **columns)

    direct_graph = cugraph.DiGraph()
    direct_graph.from_dask_cudf_edgelist(ddf, **columns)

    assert generic_graph.EdgeList == direct_graph.EdgeList


@pytest.mark.skipif(
is_single_gpu(), reason="skipping MG testing on Single GPU system"
)
def test_compute_local_data(client_connection):

input_data_path = r"../datasets/karate.csv"
chunksize = dcg.get_chunksize(input_data_path)
Expand Down
57 changes: 53 additions & 4 deletions python/cugraph/tests/test_convert_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,15 @@
import networkx as nx


@pytest.mark.parametrize("graph_file", utils.DATASETS)
def test_to_from_pandas(graph_file):
# =============================================================================
# Pytest Setup / Teardown - called for each test function
# =============================================================================
def setup_function():
    """Run a garbage-collection pass; pytest calls this per test function."""
    gc.collect()


@pytest.mark.parametrize("graph_file", utils.DATASETS)
def test_to_from_pandas(graph_file):
# Read in the graph
M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)

Expand Down Expand Up @@ -79,8 +84,6 @@ def test_to_from_pandas(graph_file):

@pytest.mark.parametrize("graph_file", utils.DATASETS)
def test_from_to_numpy(graph_file):
gc.collect()

# Read in the graph
M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)

Expand Down Expand Up @@ -145,3 +148,49 @@ def test_from_to_numpy(graph_file):
res_pdf = res_pdf[['src', 'dst', 'weights']]

assert exp_pdf.equals(res_pdf)


@pytest.mark.parametrize("graph_file", utils.DATASETS)
def test_from_edgelist(graph_file):
    """
    Call cugraph.from_edgelist() once with a cudf DataFrame and once with a
    pandas DataFrame read from the same file, and ensure the resulting Graph
    objs have equal EdgeLists.
    """
    columns = {"source": "0", "destination": "1"}

    cudf_edges = utils.read_csv_file(graph_file)
    pandas_edges = utils.read_csv_for_nx(graph_file)

    graph_from_cudf = cugraph.from_edgelist(cudf_edges, **columns)
    graph_from_pandas = cugraph.from_edgelist(pandas_edges, **columns)

    assert graph_from_cudf.EdgeList == graph_from_pandas.EdgeList


@pytest.mark.parametrize("graph_file", utils.DATASETS)
def test_from_adjlist(graph_file):
    """
    Call cugraph.from_adjlist() with cudf Series and with the equivalent
    pandas Series and ensure the resulting graphs have equal AdjLists. Also
    check that mixed Series types and an invalid 'create_using' value raise
    TypeError.
    """
    source_graph = utils.generate_cugraph_graph_from_file(graph_file,
                                                          directed=True)
    cu_offsets, cu_indices, cu_vals = source_graph.view_adj_list()

    # pandas copies of the same CSR arrays; weights may be absent.
    pd_offsets = cu_offsets.to_pandas()
    pd_indices = cu_indices.to_pandas()
    pd_vals = None if cu_vals is None else cu_vals.to_pandas()

    # FIXME: should mixing types be allowed?
    with pytest.raises(TypeError):
        cugraph.from_adjlist(cu_offsets, pd_indices)
    with pytest.raises(TypeError):
        cugraph.from_adjlist(cu_offsets, cu_indices, cu_vals,
                             create_using=33)

    graph_from_cudf = cugraph.from_adjlist(cu_offsets, cu_indices, cu_vals,
                                           create_using=cugraph.DiGraph)
    graph_from_pandas = cugraph.from_adjlist(pd_offsets, pd_indices, pd_vals,
                                             create_using=cugraph.DiGraph)

    assert graph_from_cudf.AdjList == graph_from_pandas.AdjList
Loading