Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Optimize PG.add_data #2924

Merged
merged 3 commits into from
Nov 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions python/cugraph/cugraph/dask/structure/mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,8 @@ def add_vertex_data(
tmp_df[self.vertex_col_name] = tmp_df[vertex_col_name]
# FIXME: handle case of a type_name column already being in tmp_df

# FIXME: We should do categorization first
# Related issue: https://github.com/rapidsai/cugraph/issues/2903
tmp_df[TCN] = type_name
tmp_df[TCN] = tmp_df[TCN].astype(cat_dtype)

Expand Down Expand Up @@ -626,6 +628,9 @@ def add_edge_data(
tmp_df[self.src_col_name] = tmp_df[vertex_col_names[0]]
tmp_df[self.dst_col_name] = tmp_df[vertex_col_names[1]]

# FIXME: We should do categorization first
# Related issue: https://github.com/rapidsai/cugraph/issues/2903

tmp_df[TCN] = type_name
tmp_df[TCN] = tmp_df[TCN].astype(cat_dtype)

Expand Down
6 changes: 4 additions & 2 deletions python/cugraph/cugraph/structure/property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,8 @@ def add_vertex_data(
if self.__series_type is cudf.Series:
# cudf does not yet support initialization with a scalar
tmp_df[TCN] = cudf.Series(
np.repeat(type_name, len(tmp_df)), index=tmp_df.index, dtype=cat_dtype
cudf.Series([type_name], dtype=cat_dtype).repeat(len(tmp_df)),
index=tmp_df.index,
)
else:
# pandas is oddly slow if dtype is passed to the constructor here
Expand Down Expand Up @@ -909,7 +910,8 @@ def add_edge_data(
if self.__series_type is cudf.Series:
# cudf does not yet support initialization with a scalar
tmp_df[TCN] = cudf.Series(
np.repeat(type_name, len(tmp_df)), index=tmp_df.index, dtype=cat_dtype
cudf.Series([type_name], dtype=cat_dtype).repeat(len(tmp_df)),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice one, doing this operation on the GPU.

index=tmp_df.index,
)
else:
# pandas is oddly slow if dtype is passed to the constructor here
Expand Down
15 changes: 15 additions & 0 deletions python/cugraph/cugraph/tests/test_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import pandas as pd
import numpy as np
import cudf
import cupy as cp
from cudf.testing import assert_frame_equal, assert_series_equal
from cugraph.experimental.datasets import cyber

Expand Down Expand Up @@ -1965,6 +1966,20 @@ def bench_extract_subgraph_for_rmat(gpubenchmark, rmat_PropertyGraph):
)


@pytest.mark.parametrize("n_rows", [15_000_000, 30_000_000, 60_000_000, 120_000_000])
def bench_add_edge_data(gpubenchmark, n_rows):
    """
    Benchmark PropertyGraph.add_edge_data() on an edge list of n_rows rows.

    The callable handed to the gpubenchmark fixture does all of its own
    setup: it creates a new PropertyGraph, builds a two-column cudf
    DataFrame ("src" = 0..n_rows-1, "dst" = src - 1) and adds it as edge
    data under a single type name. DataFrame construction is therefore part
    of what is measured, not only the add_edge_data() call.
    """
    from cugraph.experimental import PropertyGraph

    edge_type_name = "('_N', '_E', '_N')"

    def build_and_add_edges():
        graph = PropertyGraph()
        sources = cp.arange(n_rows)
        # Each edge points at the previous vertex id, so the first
        # destination value is -1.
        edge_df = cudf.DataFrame({"src": sources, "dst": sources - 1})
        graph.add_edge_data(edge_df, ["src", "dst"], type_name=edge_type_name)

    gpubenchmark(build_and_add_edges)


# This test runs for *minutes* with the current implementation, and since
# benchmarking can call it multiple times per run, the overall time for this
# test can be ~20 minutes.
Expand Down