Skip to content

Commit

Permalink
Optimize PG.add_data (#2924)
Browse files Browse the repository at this point in the history
This PR fixes #2903.

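This PR reduces the memory footprint by `3.5x`, speeds up `add_data` by `557x`, and removes the previous limit on how many edges can be added (the 120,000,000-row case used to fail). Note the units in the tables below: times are in seconds before the PR and in milliseconds after it.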

Before PR:
```python3
Name (time in s, mem in bytes)       Mean                  GPU mem            GPU Leaked mem            Rounds            GPU Rounds          
----------------------------------------------------------------------------------------------------------------------------------------------
bench_add_edge_data[15000000]      2.3044 (1.0)      2,160,000,064 (1.0)                   0 (1.0)           1           1
bench_add_edge_data[30000000]      4.7941 (2.08)     4,320,000,064 (2.00)                  0 (1.0)           1           1
bench_add_edge_data[60000000]      8.7235 (3.79)     8,640,000,064 (4.00)                  0 (1.0)           1           1
bench_add_edge_data[120000000]  FAILED
----------------------------------------------------------------------------------------------------------------------------------------------
```


After PR:
```python
-------------------------------------------------------------- benchmark: 4 tests --------------------------------------------------------------
Name (time in ms, mem in bytes)        Mean                  GPU mem            GPU Leaked mem            Rounds            GPU Rounds          
------------------------------------------------------------------------------------------------------------------------------------------------
bench_add_edge_data[15000000]       16.3785 (1.0)        615,000,080 (1.0)                   0 (1.0)           1           1
bench_add_edge_data[30000000]       17.3631 (1.06)     1,230,000,080 (2.00)                  0 (1.0)           1           1
bench_add_edge_data[60000000]       22.2947 (1.36)     2,460,000,080 (4.00)                  0 (1.0)           1           1
bench_add_edge_data[120000000]      26.9747 (1.65)     4,920,000,080 (8.00)                  0 (1.0)           1           1
------------------------------------------------------------------------------------------------------------------------------------------------
```

Authors:
  - Vibhu Jawa (https://github.com/VibhuJawa)

Approvers:
  - Joseph Nke (https://github.com/jnke2016)
  - Brad Rees (https://github.com/BradReesWork)

URL: #2924
  • Loading branch information
VibhuJawa authored Nov 15, 2022
1 parent 85c447b commit 605e2b5
Show file tree
Hide file tree
Showing 3 changed files with 24 additions and 2 deletions.
5 changes: 5 additions & 0 deletions python/cugraph/cugraph/dask/structure/mg_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,6 +408,8 @@ def add_vertex_data(
tmp_df[self.vertex_col_name] = tmp_df[vertex_col_name]
# FIXME: handle case of a type_name column already being in tmp_df

# FIXME: We should do categorization first
# Related issue: https://github.com/rapidsai/cugraph/issues/2903
tmp_df[TCN] = type_name
tmp_df[TCN] = tmp_df[TCN].astype(cat_dtype)

Expand Down Expand Up @@ -626,6 +628,9 @@ def add_edge_data(
tmp_df[self.src_col_name] = tmp_df[vertex_col_names[0]]
tmp_df[self.dst_col_name] = tmp_df[vertex_col_names[1]]

# FIXME: We should do categorization first
# Related issue: https://github.com/rapidsai/cugraph/issues/2903

tmp_df[TCN] = type_name
tmp_df[TCN] = tmp_df[TCN].astype(cat_dtype)

Expand Down
6 changes: 4 additions & 2 deletions python/cugraph/cugraph/structure/property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,7 +618,8 @@ def add_vertex_data(
if self.__series_type is cudf.Series:
# cudf does not yet support initialization with a scalar
tmp_df[TCN] = cudf.Series(
np.repeat(type_name, len(tmp_df)), index=tmp_df.index, dtype=cat_dtype
cudf.Series([type_name], dtype=cat_dtype).repeat(len(tmp_df)),
index=tmp_df.index,
)
else:
# pandas is oddly slow if dtype is passed to the constructor here
Expand Down Expand Up @@ -909,7 +910,8 @@ def add_edge_data(
if self.__series_type is cudf.Series:
# cudf does not yet support initialization with a scalar
tmp_df[TCN] = cudf.Series(
np.repeat(type_name, len(tmp_df)), index=tmp_df.index, dtype=cat_dtype
cudf.Series([type_name], dtype=cat_dtype).repeat(len(tmp_df)),
index=tmp_df.index,
)
else:
# pandas is oddly slow if dtype is passed to the constructor here
Expand Down
15 changes: 15 additions & 0 deletions python/cugraph/cugraph/tests/test_property_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import pandas as pd
import numpy as np
import cudf
import cupy as cp
from cudf.testing import assert_frame_equal, assert_series_equal
from cugraph.experimental.datasets import cyber

Expand Down Expand Up @@ -1965,6 +1966,20 @@ def bench_extract_subgraph_for_rmat(gpubenchmark, rmat_PropertyGraph):
)


@pytest.mark.parametrize("n_rows", [15_000_000, 30_000_000, 60_000_000, 120_000_000])
def bench_add_edge_data(gpubenchmark, n_rows):
    """Benchmark ``PropertyGraph.add_edge_data`` on ``n_rows`` synthetic edges.

    Each edge goes from vertex ``i`` to vertex ``i - 1`` for ``i`` in
    ``range(n_rows)``; all edges share a single type name.
    """
    from cugraph.experimental import PropertyGraph

    def add_edges_to_fresh_graph():
        # The graph and the edge DataFrame are both built inside the
        # benchmarked callable, so their cost is part of the measurement.
        graph = PropertyGraph()
        sources = cp.arange(n_rows)
        destinations = sources - 1
        edge_df = cudf.DataFrame({"src": sources, "dst": destinations})
        graph.add_edge_data(
            edge_df,
            ["src", "dst"],
            type_name="('_N', '_E', '_N')",
        )

    gpubenchmark(add_edges_to_fresh_graph)


# This test runs for *minutes* with the current implementation, and since
# benchmarking can call it multiple times per run, the overall time for this
# test can be ~20 minutes.
Expand Down

0 comments on commit 605e2b5

Please sign in to comment.