rapidsai · rapids-bot · Nov 15, 2022 · Nov 14, 2022 · Nov 15, 2022 · Nov 15, 2022
@@ -408,6 +408,8 @@ def add_vertex_data(
         tmp_df[self.vertex_col_name] = tmp_df[vertex_col_name]
         # FIXME: handle case of a type_name column already being in tmp_df
 
+        # FIXME: Cast type_name to categorical before broadcasting it to all rows
+        # Related issue: https://github.com/rapidsai/cugraph/issues/2903
         tmp_df[TCN] = type_name
         tmp_df[TCN] = tmp_df[TCN].astype(cat_dtype)
 
@@ -626,6 +628,9 @@ def add_edge_data(
         tmp_df[self.src_col_name] = tmp_df[vertex_col_names[0]]
         tmp_df[self.dst_col_name] = tmp_df[vertex_col_names[1]]
 
+        # FIXME: Cast type_name to categorical before broadcasting it to all rows
+        # Related issue: https://github.com/rapidsai/cugraph/issues/2903
+
         tmp_df[TCN] = type_name
         tmp_df[TCN] = tmp_df[TCN].astype(cat_dtype)
 

@@ -618,7 +618,8 @@ def add_vertex_data(
         if self.__series_type is cudf.Series:
             # cudf does not yet support initialization with a scalar
             tmp_df[TCN] = cudf.Series(
-                np.repeat(type_name, len(tmp_df)), index=tmp_df.index, dtype=cat_dtype
+                cudf.Series([type_name], dtype=cat_dtype).repeat(len(tmp_df)),
+                index=tmp_df.index,
             )
         else:
             # pandas is oddly slow if dtype is passed to the constructor here
@@ -909,7 +910,8 @@ def add_edge_data(
         if self.__series_type is cudf.Series:
             # cudf does not yet support initialization with a scalar
             tmp_df[TCN] = cudf.Series(
-                np.repeat(type_name, len(tmp_df)), index=tmp_df.index, dtype=cat_dtype
+                cudf.Series([type_name], dtype=cat_dtype).repeat(len(tmp_df)),
+                index=tmp_df.index,
             )
         else:
             # pandas is oddly slow if dtype is passed to the constructor here

@@ -18,6 +18,7 @@
 import pandas as pd
 import numpy as np
 import cudf
+import cupy as cp
 from cudf.testing import assert_frame_equal, assert_series_equal
 from cugraph.experimental.datasets import cyber
 
@@ -1965,6 +1966,20 @@ def bench_extract_subgraph_for_rmat(gpubenchmark, rmat_PropertyGraph):
     )
 
 
+@pytest.mark.parametrize("n_rows", [15_000_000, 30_000_000, 60_000_000, 120_000_000])
+def bench_add_edge_data(gpubenchmark, n_rows):
+    from cugraph.experimental import PropertyGraph
+
+    def func():
+        pg = PropertyGraph()
+        src = cp.arange(n_rows)
+        dst = src - 1
+        df = cudf.DataFrame({"src": src, "dst": dst})
+        pg.add_edge_data(df, ["src", "dst"], type_name="('_N', '_E', '_N')")
+
+    gpubenchmark(func)
+
+
 # This test runs for *minutes* with the current implementation, and since
 # benchmarking can call it multiple times per run, the overall time for this
 # test can be ~20 minutes.