-
Notifications
You must be signed in to change notification settings - Fork 311
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimize the drop-duplicate functionality #4095
Changes from 32 commits
883644c
6113946
f92d936
64ec680
2e4b0a7
0695fed
0442e53
fd98039
1d87370
1d39ec6
c4d9580
ad62f33
779bd2d
5900bd8
705788f
5fa81c4
c85f622
8f7436e
7c165e7
fc5e627
50a11c0
f3f1c3f
77fe0f5
7db4a10
b3077cf
bdcd215
2c6892a
ff5373a
1efef76
ebbd33f
ed76013
890f885
ccc5189
63a669b
a6d767f
e7ea077
ad2602c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Copyright (c) 2021-2023, NVIDIA CORPORATION. | ||
# Copyright (c) 2021-2024, NVIDIA CORPORATION. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
|
@@ -264,7 +264,7 @@ def __from_edgelist( | |
source, | ||
destination, | ||
edge_attr, | ||
multi=self.properties.multi_edge, | ||
multi=True, # Deprecated parameter | ||
symmetrize=not self.properties.directed, | ||
) | ||
|
||
|
@@ -279,7 +279,7 @@ def __from_edgelist( | |
elist, | ||
source, | ||
destination, | ||
multi=self.properties.multi_edge, | ||
multi=True, # Deprecated parameter | ||
symmetrize=not self.properties.directed, | ||
) | ||
|
||
|
@@ -298,7 +298,10 @@ def __from_edgelist( | |
self._replicate_edgelist() | ||
|
||
self._make_plc_graph( | ||
value_col=value_col, store_transposed=store_transposed, renumber=renumber | ||
value_col=value_col, | ||
store_transposed=store_transposed, | ||
renumber=renumber, | ||
drop_multi_edges=not self.properties.multi_edge, | ||
) | ||
|
||
def to_pandas_edgelist( | ||
|
@@ -477,13 +480,15 @@ def view_edge_list(self): | |
edgelist_df[simpleGraphImpl.srcCol] | ||
<= edgelist_df[simpleGraphImpl.dstCol] | ||
] | ||
|
||
elif not use_initial_input_df and self.properties.renumbered: | ||
# Do not unrenumber the vertices if the initial input df was used | ||
if not self.properties.directed: | ||
edgelist_df = edgelist_df[ | ||
edgelist_df[simpleGraphImpl.srcCol] | ||
<= edgelist_df[simpleGraphImpl.dstCol] | ||
] | ||
|
||
edgelist_df = self.renumber_map.unrenumber( | ||
edgelist_df, simpleGraphImpl.srcCol | ||
) | ||
|
@@ -506,6 +511,18 @@ def view_edge_list(self): | |
simpleGraphImpl.dstCol: dstCol, | ||
} | ||
) | ||
if not self.properties.multi_edge: | ||
if type(srcCol) is list and type(dstCol) is list: | ||
vertex_col_name = srcCol + dstCol | ||
|
||
else: | ||
vertex_col_name = [srcCol, dstCol] | ||
|
||
# Drop parallel edges for non MultiGraph | ||
# FIXME: Drop multi edges with the CAPI instead. | ||
edgelist_df = edgelist_df.groupby( | ||
by=[*vertex_col_name], as_index=False | ||
).min() | ||
|
||
# FIXME: When renumbered, the MG API uses renumbered col names which | ||
# is not consistant with the SG API. | ||
|
@@ -827,6 +844,14 @@ def number_of_edges(self, directed_edges=False): | |
""" | ||
Get the number of edges in the graph. | ||
""" | ||
if not self.properties.multi_edge: | ||
# | ||
# Drop parallel edges for non MultiGraph | ||
# FIXME: Drop multi edges with the CAPI instead. | ||
if self.edgelist is not None: | ||
self.edgelist.edgelist_df = self.edgelist.edgelist_df.groupby( | ||
by=[simpleGraphImpl.srcCol, simpleGraphImpl.dstCol], as_index=False | ||
).min() | ||
# TODO: Move to Outer graphs? | ||
if directed_edges and self.edgelist is not None: | ||
return len(self.edgelist.edgelist_df) | ||
|
@@ -1084,6 +1109,7 @@ def _make_plc_graph( | |
value_col: Dict[str, cudf.DataFrame] = None, | ||
store_transposed: bool = False, | ||
renumber: bool = True, | ||
drop_multi_edges: bool = False, | ||
): | ||
""" | ||
Parameters | ||
|
@@ -1100,6 +1126,8 @@ def _make_plc_graph( | |
Whether to renumber the vertices of the graph. | ||
Required if inputted vertex ids are not of | ||
int32 or int64 type. | ||
drop_multi_edges: bool (default=False) | ||
Whether to drop multi edges | ||
""" | ||
|
||
if value_col is None: | ||
|
@@ -1163,6 +1191,7 @@ def _make_plc_graph( | |
renumber=renumber, | ||
do_expensive_check=True, | ||
input_array_format=input_array_format, | ||
drop_multi_edges=drop_multi_edges, | ||
) | ||
|
||
def to_directed(self, DiG, store_transposed=False): | ||
|
@@ -1306,6 +1335,14 @@ def neighbors(self, n): | |
n = node[0] | ||
|
||
df = self.edgelist.edgelist_df | ||
|
||
vertex_col_name = [simpleGraphImpl.srcCol, simpleGraphImpl.dstCol] | ||
|
||
if not self.properties.multi_edge: | ||
# Drop parallel edges for non MultiGraph | ||
# FIXME: Drop multi edges with the CAPI instead. | ||
df = df.groupby(by=[*vertex_col_name], as_index=False).min() | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same question here , assuming this a cuDF dataframe. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes it is a cuDF dataframe There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks |
||
neighbors = df[df[simpleGraphImpl.srcCol] == n][ | ||
simpleGraphImpl.dstCol | ||
].reset_index(drop=True) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
# Copyright (c) 2021-2023, NVIDIA CORPORATION. | ||
# Copyright (c) 2021-2024, NVIDIA CORPORATION. | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
|
@@ -1176,10 +1176,12 @@ def test_extract_subgraph_vertex_prop_condition_only( | |
) | ||
# Should result in two edges, one a "relationship", the other a "referral" | ||
expected_edgelist = cudf.DataFrame( | ||
{"src": [89216, 78634], "dst": [78634, 89216], "weights": [99, 8]} | ||
{"src": [78634, 89216], "dst": [89216, 78634], "weights": [8, 99]} | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wonder why we flipped it around ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I initially removed the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for fixing this. |
||
|
||
if G.renumbered: | ||
# FIXME: Can only use the attribute 'edgelist.edgelist_df' for directed | ||
# graphs | ||
actual_edgelist = G.unrenumber( | ||
G.edgelist.edgelist_df, "src", preserve_order=True | ||
) | ||
|
@@ -1472,7 +1474,7 @@ def test_extract_subgraph_no_query(dataset1_PropertyGraph, as_pg_first): | |
# referrals has 3 edges with the same src/dst, so subtract 2 from | ||
# the total count since this is not creating a multigraph.. | ||
num_edges -= 2 | ||
assert len(G.edgelist.edgelist_df) == num_edges | ||
assert len(G.view_edge_list()) == num_edges | ||
|
||
|
||
@pytest.mark.sg | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just confirming,
edgelist_df
is acuDF
data frame right ?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes it is a cuDF dataframe
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks