Merge pull request #1280 from Iroy30/add_multigraph

[REVIEW] enable multigraph
rapidsai · Jan 21, 2021 · da66ecf · da66ecf
2 parents e205fd0 + 8a5bdc3
commit da66ecf
Show file tree

Hide file tree

Showing 8 changed files with 190 additions and 56 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,6 +10,7 @@
 - PR #1279 Add self loop check variable in graph
 - PR #1277 SciPy sparse matrix input support for WCC, SCC, SSSP, and BFS
 - PR #1278 Add support for shortest_path_length and fix graph vertex checks
+- PR #1280 Add Multi(Di)Graph support
 
 ## Improvements
 - PR #1227 Pin cmake policies to cmake 3.17 version

diff --git a/python/cugraph/__init__.py b/python/cugraph/__init__.py
@@ -29,6 +29,8 @@
 from cugraph.structure import (
     Graph,
     DiGraph,
+    MultiGraph,
+    MultiDiGraph,
     from_edgelist,
     from_cudf_edgelist,
     from_pandas_edgelist,

diff --git a/python/cugraph/structure/__init__.py b/python/cugraph/structure/__init__.py
@@ -11,7 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from cugraph.structure.graph import Graph, DiGraph
+from cugraph.structure.graph import Graph, DiGraph, MultiGraph, MultiDiGraph
 from cugraph.structure.number_map import NumberMap
 from cugraph.structure.symmetrize import symmetrize, symmetrize_df , symmetrize_ddf
 from cugraph.structure.convert_matrix import (from_edgelist,

diff --git a/python/cugraph/structure/graph.py b/python/cugraph/structure/graph.py
@@ -73,7 +73,6 @@ def __init__(self, offsets, indices, value=None):
     def __init__(
         self,
         m_graph=None,
-        edge_attr=None,
         symmetrized=False,
         bipartite=False,
         multi=False,
@@ -113,24 +112,22 @@ def __init__(
         self.batch_transposed_adjlists = None
 
         if m_graph is not None:
-            if (type(self) is Graph and type(m_graph) is MultiGraph) or (
-                type(self) is DiGraph and type(m_graph) is MultiDiGraph
-            ):
-                self.from_cudf_edgelist(
-                    m_graph.edgelist.edgelist_df,
-                    source="src",
-                    destination="dst",
-                    edge_attr=edge_attr,
-                )
-                self.renumbered = m_graph.renumbered
-                self.renumber_map = m_graph.renumber_map
+            if type(m_graph) is MultiGraph or type(m_graph) is MultiDiGraph:
+                elist = m_graph.view_edge_list()
+                if m_graph.edgelist.weights:
+                    weights = "weights"
+                else:
+                    weights = None
+                self.from_cudf_edgelist(elist,
+                                        source="src",
+                                        destination="dst",
+                                        edge_attr=weights)
             else:
                 msg = (
-                    "Graph can be initialized using MultiGraph "
-                    "and DiGraph can be initialized using MultiDiGraph"
+                    "Graph can only be initialized using MultiGraph "
+                    "or MultiDiGraph"
                 )
                 raise Exception(msg)
-        # self.number_of_vertices = None
 
     def enable_batch(self):
         client = mg_utils.get_client()
@@ -278,6 +275,12 @@ def is_multipartite(self):
         # TO DO: Call coloring algorithm
         return self.multipartite or self.bipartite
 
+    def is_multigraph(self):
+        """
+        Returns True if the graph is a multigraph. Else returns False.
+        """
+        return self.multi
+
     def sets(self):
         """
         Returns the bipartite set of nodes. This solely relies on the user's
@@ -409,24 +412,19 @@ def from_cudf_edgelist(
         source_col = elist[source]
         dest_col = elist[destination]
 
-        if self.multi:
-            if type(edge_attr) is not list:
-                raise Exception("edge_attr should be a list of column names")
-            value_col = {}
-            for col_name in edge_attr:
-                value_col[col_name] = elist[col_name]
-        elif edge_attr is not None:
+        if edge_attr is not None:
             value_col = elist[edge_attr]
         else:
             value_col = None
 
-        if not self.symmetrized and not self.multi:
-            if value_col is not None:
-                source_col, dest_col, value_col = symmetrize(
-                    source_col, dest_col, value_col
-                )
-            else:
-                source_col, dest_col = symmetrize(source_col, dest_col)
+        if value_col is not None:
+            source_col, dest_col, value_col = symmetrize(
+                source_col, dest_col, value_col, multi=self.multi,
+                symmetrize=not self.symmetrized)
+        else:
+            source_col, dest_col = symmetrize(
+                source_col, dest_col, multi=self.multi,
+                symmetrize=not self.symmetrized)
 
         self.edgelist = Graph.EdgeList(source_col, dest_col, value_col)
 
@@ -727,7 +725,7 @@ def view_edge_list(self):
             edgelist_df = self.unrenumber(edgelist_df, "src")
             edgelist_df = self.unrenumber(edgelist_df, "dst")
 
-        if type(self) is Graph:
+        if type(self) is Graph or type(self) is MultiGraph:
             edgelist_df = edgelist_df[edgelist_df["src"] <= edgelist_df["dst"]]
             edgelist_df = edgelist_df.reset_index(drop=True)
             self.edge_count = len(edgelist_df)
@@ -1019,7 +1017,7 @@ def number_of_edges(self, directed_edges=False):
             return len(self.edgelist.edgelist_df)
         if self.edge_count is None:
             if self.edgelist is not None:
-                if type(self) is Graph:
+                if type(self) is Graph or type(self) is MultiGraph:
                     self.edge_count = len(
                         self.edgelist.edgelist_df[
                             self.edgelist.edgelist_df["src"]
@@ -1511,17 +1509,26 @@ def add_internal_vertex_id(
 
 
 class DiGraph(Graph):
-    def __init__(self, m_graph=None, edge_attr=None):
+    """
+    cuGraph directed graph class. Drops parallel edges.
+    """
+    def __init__(self, m_graph=None):
         super().__init__(
-            m_graph=m_graph, edge_attr=edge_attr, symmetrized=True
+            m_graph=m_graph, symmetrized=True
         )
 
 
 class MultiGraph(Graph):
+    """
+    cuGraph class to create and store undirected graphs with parallel edges.
+    """
     def __init__(self, renumbered=True):
         super().__init__(multi=True)
 
 
 class MultiDiGraph(Graph):
+    """
+    cuGraph class to create and store directed graphs with parallel edges.
+    """
     def __init__(self, renumbered=True):
         super().__init__(symmetrized=True, multi=True)
diff --git a/python/cugraph/structure/symmetrize.py b/python/cugraph/structure/symmetrize.py
@@ -16,7 +16,7 @@
 import dask_cudf
 
 
-def symmetrize_df(df, src_name, dst_name):
+def symmetrize_df(df, src_name, dst_name, multi=False, symmetrize=True):
     """
     Take a COO stored in a DataFrame, along with the column names of
     the source and destination columns and create a new data frame
@@ -42,6 +42,13 @@ def symmetrize_df(df, src_name, dst_name):
         Name of the column in the data frame containing the source ids
     dst_name : string
         Name of the column in the data frame containing the destination ids
+    multi : bool
+        Set to True if graph is a Multi(Di)Graph. This allows multiple
+        edges instead of dropping them.
+    symmetrize : bool
+        Default is True to perform symmetrization. If False only duplicate
+        edges are dropped.
+
     Examples
     --------
     >>> import cugraph.dask as dcg
@@ -54,26 +61,30 @@ def symmetrize_df(df, src_name, dst_name):
     >>> sym_ddf = cugraph.symmetrize_ddf(ddf, "src", "dst", "weight")
     >>> Comms.destroy()
     """
-    gdf = cudf.DataFrame()
-
     #
     #  Now append the columns.  We add sources to the end of destinations,
     #  and destinations to the end of sources.  Otherwise we append a
     #  column onto itself.
     #
-    for idx, name in enumerate(df.columns):
-        if name == src_name:
-            gdf[src_name] = df[src_name].append(
-                df[dst_name], ignore_index=True
-            )
-        elif name == dst_name:
-            gdf[dst_name] = df[dst_name].append(
-                df[src_name], ignore_index=True
-            )
-        else:
-            gdf[name] = df[name].append(df[name], ignore_index=True)
-
-    return gdf.groupby(by=[src_name, dst_name], as_index=False).min()
+    if symmetrize:
+        gdf = cudf.DataFrame()
+        for idx, name in enumerate(df.columns):
+            if name == src_name:
+                gdf[src_name] = df[src_name].append(
+                    df[dst_name], ignore_index=True
+                )
+            elif name == dst_name:
+                gdf[dst_name] = df[dst_name].append(
+                    df[src_name], ignore_index=True
+                )
+            else:
+                gdf[name] = df[name].append(df[name], ignore_index=True)
+    else:
+        gdf = df
+    if multi:
+        return gdf
+    else:
+        return gdf.groupby(by=[src_name, dst_name], as_index=False).min()
 
 
 def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
@@ -105,6 +116,12 @@ def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
         Name of the column in the data frame containing the source ids
     dst_name : string
         Name of the column in the data frame containing the destination ids
+    multi : bool
+        Set to True if graph is a Multi(Di)Graph. This allows multiple
+        edges instead of dropping them.
+    symmetrize : bool
+        Default is True to perform symmetrization. If False only duplicate
+        edges are dropped.
 
     Examples
     --------
@@ -129,7 +146,8 @@ def symmetrize_ddf(df, src_name, dst_name, weight_name=None):
     return result
 
 
-def symmetrize(source_col, dest_col, value_col=None):
+def symmetrize(source_col, dest_col, value_col=None, multi=False,
+               symmetrize=True):
     """
     Take a COO set of source destination pairs along with associated values
     stored in a single GPU or distributed
@@ -190,7 +208,8 @@ def symmetrize(source_col, dest_col, value_col=None):
             input_df, "source", "destination", weight_name
         ).persist()
     else:
-        output_df = symmetrize_df(input_df, "source", "destination")
+        output_df = symmetrize_df(input_df, "source", "destination", multi,
+                                  symmetrize)
 
     if value_col is not None:
         return (

diff --git a/python/cugraph/tests/test_multigraph.py b/python/cugraph/tests/test_multigraph.py
@@ -0,0 +1,104 @@
+import cugraph
+import networkx as nx
+from cugraph.tests import utils
+import pytest
+import gc
+import numpy as np
+
+
+# =============================================================================
+# Pytest Setup / Teardown - called for each test function
+# =============================================================================
+def setup_function():
+    gc.collect()
+
+
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+def test_multigraph(graph_file):
+    # FIXME: Migrate to new test fixtures for Graph setup once available
+    cuM = utils.read_csv_file(graph_file)
+    G = cugraph.MultiDiGraph()
+    G.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
+
+    nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
+    Gnx = nx.from_pandas_edgelist(
+        nxM,
+        source="0",
+        target="1",
+        edge_attr="weight",
+        create_using=nx.MultiDiGraph(),
+    )
+
+    assert G.number_of_edges() == Gnx.number_of_edges()
+    assert G.number_of_nodes() == Gnx.number_of_nodes()
+    cuedges = cugraph.to_pandas_edgelist(G)
+    cuedges.rename(columns={"src": "source", "dst": "target",
+                   "weights": "weight"}, inplace=True)
+    cuedges["weight"] = cuedges["weight"].round(decimals=3)
+    nxedges = nx.to_pandas_edgelist(Gnx).astype(dtype={"source": "int32",
+                                                       "target": "int32",
+                                                       "weight": "float32"})
+    cuedges = cuedges.sort_values(by=["source", "target"]).\
+        reset_index(drop=True)
+    nxedges = nxedges.sort_values(by=["source", "target"]).\
+        reset_index(drop=True)
+    nxedges["weight"] = nxedges["weight"].round(decimals=3)
+    assert nxedges.equals(cuedges[["source", "target", "weight"]])
+
+
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+def test_Graph_from_MultiGraph(graph_file):
+    # FIXME: Migrate to new test fixtures for Graph setup once available
+    cuM = utils.read_csv_file(graph_file)
+    GM = cugraph.MultiGraph()
+    GM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
+    nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
+    GnxM = nx.from_pandas_edgelist(
+        nxM,
+        source="0",
+        target="1",
+        edge_attr="weight",
+        create_using=nx.MultiGraph(),
+    )
+
+    G = cugraph.Graph(GM)
+    Gnx = nx.Graph(GnxM)
+    assert Gnx.number_of_edges() == G.number_of_edges()
+
+    GdM = cugraph.MultiDiGraph()
+    GdM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
+    GnxdM = nx.from_pandas_edgelist(
+        nxM,
+        source="0",
+        target="1",
+        edge_attr="weight",
+        create_using=nx.MultiGraph(),
+    )
+    Gd = cugraph.DiGraph(GdM)
+    Gnxd = nx.DiGraph(GnxdM)
+    assert Gnxd.number_of_edges() == Gd.number_of_edges()
+
+
+@pytest.mark.parametrize("graph_file", utils.DATASETS)
+def test_multigraph_sssp(graph_file):
+    # FIXME: Migrate to new test fixtures for Graph setup once available
+    cuM = utils.read_csv_file(graph_file)
+    G = cugraph.MultiDiGraph()
+    G.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2")
+    cu_paths = cugraph.sssp(G, 0)
+    max_val = np.finfo(cu_paths["distance"].dtype).max
+    cu_paths = cu_paths[cu_paths["distance"] != max_val]
+    nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True)
+    Gnx = nx.from_pandas_edgelist(
+        nxM,
+        source="0",
+        target="1",
+        edge_attr="weight",
+        create_using=nx.MultiDiGraph(),
+    )
+    nx_paths = nx.single_source_dijkstra_path_length(Gnx, 0)
+
+    cu_dist = cu_paths.sort_values(by='vertex')['distance'].to_array()
+    nx_dist = [i[1] for i in sorted(nx_paths.items())]
+
+    assert (cu_dist == nx_dist).all()
diff --git a/python/cugraph/traversal/sssp.py b/python/cugraph/traversal/sssp.py
@@ -14,7 +14,7 @@
 import numpy as np
 
 import cudf
-from cugraph.structure import Graph, DiGraph
+from cugraph.structure import Graph, DiGraph, MultiGraph, MultiDiGraph
 from cugraph.traversal import sssp_wrapper
 from cugraph.utilities import (ensure_cugraph_obj,
                                is_matrix_type,
@@ -104,7 +104,7 @@ def _convert_df_to_output_type(df, input_type, return_predecessors):
     return_predecessors is only used for return values from cupy/scipy input
     types.
     """
-    if input_type in [Graph, DiGraph]:
+    if input_type in [Graph, DiGraph, MultiGraph, MultiDiGraph]:
         return df
 
     elif (nx is not None) and (input_type in [nx.Graph, nx.DiGraph]):