diff --git a/notebooks/algorithms/structure/Renumber-2.ipynb b/notebooks/algorithms/structure/Renumber-2.ipynb index c4143b6a70b..6a52632b38a 100755 --- a/notebooks/algorithms/structure/Renumber-2.ipynb +++ b/notebooks/algorithms/structure/Renumber-2.ipynb @@ -108,8 +108,8 @@ "outputs": [], "source": [ "# Since IP columns are strings, we first need to convert them to integers\n", - "gdf['src_ip'] = gdf['src'].str.ip2int()\n", - "gdf['dst_ip'] = gdf['dst'].str.ip2int()" + "gdf['src_ip'] = gdf['srcip'].str.ip2int()\n", + "gdf['dst_ip'] = gdf['dstip'].str.ip2int()" ] }, { @@ -253,4 +253,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} +} \ No newline at end of file diff --git a/python/cugraph/cugraph/experimental/datasets/__init__.py b/python/cugraph/cugraph/experimental/datasets/__init__.py index 5f355eb8cbc..3e797f3aca4 100644 --- a/python/cugraph/cugraph/experimental/datasets/__init__.py +++ b/python/cugraph/cugraph/experimental/datasets/__init__.py @@ -30,13 +30,31 @@ karate_data = Dataset(meta_path / "karate_data.yaml") karate_undirected = Dataset(meta_path / "karate_undirected.yaml") karate_asymmetric = Dataset(meta_path / "karate_asymmetric.yaml") +karate_disjoint = Dataset(meta_path / "karate-disjoint.yaml") dolphins = Dataset(meta_path / "dolphins.yaml") polbooks = Dataset(meta_path / "polbooks.yaml") netscience = Dataset(meta_path / "netscience.yaml") cyber = Dataset(meta_path / "cyber.yaml") small_line = Dataset(meta_path / "small_line.yaml") small_tree = Dataset(meta_path / "small_tree.yaml") +toy_graph = Dataset(meta_path / "toy_graph.yaml") +toy_graph_undirected = Dataset(meta_path / "toy_graph_undirected.yaml") +email_Eu_core = Dataset(meta_path / "email-Eu-core.yaml") +ktruss_polbooks = Dataset(meta_path / "ktruss_polbooks.yaml") +DATASETS_UNDIRECTED = [karate, dolphins] + +DATASETS_UNDIRECTED_WEIGHTS = [netscience] + +DATASETS_UNRENUMBERED = [karate_disjoint] + +DATASETS = [dolphins, netscience, karate_disjoint] + +DATASETS_SMALL = [karate, dolphins, polbooks] + +STRONGDATASETS = [dolphins, netscience, email_Eu_core] + +DATASETS_KTRUSS = [(polbooks, ktruss_polbooks)] MEDIUM_DATASETS = [polbooks] @@ -51,7 +69,3 @@ small_line, small_tree] TEST_GROUP = [dolphins, netscience] - -DATASETS_KTRUSS = [polbooks] - -DATASETS_UNDIRECTED = [karate_undirected, small_line, karate_asymmetric] \ No newline at end of file diff --git a/python/cugraph/cugraph/experimental/datasets/dataset.py b/python/cugraph/cugraph/experimental/datasets/dataset.py index f5595e1f354..a71cc48c13d 100644 --- a/python/cugraph/cugraph/experimental/datasets/dataset.py +++ b/python/cugraph/cugraph/experimental/datasets/dataset.py @@ -73,6 +73,10 @@ def __init__(self, meta_data_file_name): self._edgelist = None self._graph = None self._path = None + """ + self._path = self._dl_path.path / (self.metadata['name'] + + self.metadata['file_type']) + """ def __download_csv(self, url): self._dl_path.path.mkdir(parents=True, exist_ok=True) @@ -98,9 +102,7 @@ def get_edgelist(self, fetch=False): """ if self._edgelist is None: - full_path = self._dl_path.path / (self.metadata['name'] + - self.metadata['file_type']) - + full_path = self.get_path() if not full_path.is_file(): if fetch: self.__download_csv(self.metadata['url']) @@ -108,12 +110,14 @@ def get_edgelist(self, fetch=False): raise RuntimeError(f"The datafile {full_path} does not" " exist. Try get_edgelist(fetch=True)" " to download the datafile") - + header = None + if isinstance(self.metadata['header'], int): + header = self.metadata['header'] self._edgelist = cudf.read_csv(full_path, delimiter=self.metadata['delim'], names=self.metadata['col_names'], - dtype=self.metadata['col_types']) - self._path = full_path + dtype=self.metadata['col_types'], + header=header) return self._edgelist @@ -144,6 +148,7 @@ def get_graph(self, fetch=False, create_using=Graph, ignore_weights=False): if create_using is None: self._graph = Graph() elif isinstance(create_using, Graph): + # what about BFS if trnaposed is True attrs = {"directed": create_using.is_directed()} self._graph = type(create_using)(**attrs) elif type(create_using) is type: @@ -166,9 +171,8 @@ def get_path(self): """ Returns the location of the stored dataset file """ - if self._path is None: - raise RuntimeError("Path to datafile has not been set." + - " Call get_edgelist or get_graph first") + self._path = self._dl_path.path / (self.metadata['name'] + + self.metadata['file_type']) return self._path.absolute() diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml index 36858242ec7..93ab5345442 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/cyber.yaml @@ -5,17 +5,18 @@ url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/cy refs: N/A col_names: - idx - - src - - dst + - srcip + - dstip col_types: - int32 - str - str delim: "," +header: 0 has_loop: true is_directed: true is_multigraph: false is_symmetric: false -number_of_edges: 54 -number_of_nodes: 314 +number_of_edges: 2546575 +number_of_nodes: 706529 number_of_lines: 2546576 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml index ef07def2b97..e4951375321 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/dolphins.yaml @@ -15,6 +15,7 @@ col_types: - int32 - float32 delim: " " +header: None has_loop: false is_directed: true is_multigraph: false diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml new file mode 100644 index 00000000000..97d0dc82ee3 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/email-Eu-core.yaml @@ -0,0 +1,22 @@ +name: email-Eu-core +file_type: .csv +author: null +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/email-Eu-core.csv +refs: null +delim: " " +header: None +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 25571 +number_of_nodes: 1005 +number_of_lines: 25571 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml new file mode 100644 index 00000000000..0c0eaf78b63 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate-disjoint.yaml @@ -0,0 +1,22 @@ +name: karate-disjoint +file_type: .csv +author: null +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-disjoint.csv +refs: null +delim: " " +header: None +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +has_loop: false +is_directed: True +is_multigraph: false +is_symmetric: true +number_of_edges: 312 +number_of_nodes: 68 +number_of_lines: 312 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml index 9b7ac679e96..273381ed368 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate.yaml @@ -6,6 +6,7 @@ refs: W. W. Zachary, An information flow model for conflict and fission in small groups, Journal of Anthropological Research 33, 452-473 (1977). delim: " " +header: None col_names: - src - dst diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml index b4a81fde29e..3616b8fb3a5 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate_asymmetric.yaml @@ -2,10 +2,11 @@ name: karate-asymmetric file_type: .csv author: Zachary W. url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/karate-asymmetric.csv +delim: " " +header: None refs: W. W. Zachary, An information flow model for conflict and fission in small groups, Journal of Anthropological Research 33, 452-473 (1977). -delim: "\t" col_names: - src - dst diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml index d86c7b1a241..9a8b27f21ae 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate_data.yaml @@ -6,6 +6,7 @@ refs: W. W. Zachary, An information flow model for conflict and fission in small groups, Journal of Anthropological Research 33, 452-473 (1977). delim: "\t" +header: None col_names: - src - dst diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml index 061b3361367..1b45f86caee 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/karate_undirected.yaml @@ -6,6 +6,7 @@ refs: W. W. Zachary, An information flow model for conflict and fission in small groups, Journal of Anthropological Research 33, 452-473 (1977). delim: "\t" +header: None col_names: - src - dst diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml new file mode 100644 index 00000000000..1ef29b3917e --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/ktruss_polbooks.yaml @@ -0,0 +1,23 @@ +name: ktruss_polbooks +file_type: .csv +author: null +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/ref/ktruss/polbooks.csv +refs: null +delim: " " +header: None +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +has_loop: false +is_directed: true +is_multigraph: false +is_symmetric: false +number_of_edges: 233 +number_of_nodes: 58 +number_of_lines: 233 + diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml index 9c3bd8a6a1d..2dca702df3d 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/netscience.yaml @@ -4,6 +4,7 @@ author: Newman, Mark EJ url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/netscience.csv refs: Finding community structure in networks using the eigenvectors of matrices. delim: " " +header: None col_names: - src - dst diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml index 75e0e69565d..5816e5672fd 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/polbooks.yaml @@ -4,6 +4,7 @@ author: V. Krebs url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/polbooks.csv refs: null delim: " " +header: None col_names: - src - dst diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml index 9831ff11b30..5b724ac99fd 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/small_line.yaml @@ -4,6 +4,7 @@ author: null url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/small_line.csv refs: null delim: " " +header: None col_names: - src - dst diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml index 942f468c23b..8eeac346d2a 100644 --- a/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml +++ b/python/cugraph/cugraph/experimental/datasets/metadata/small_tree.yaml @@ -4,6 +4,7 @@ author: null url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/small_tree.csv refs: null delim: " " +header: None col_names: - src - dst diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml new file mode 100644 index 00000000000..819aad06f6a --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph.yaml @@ -0,0 +1,22 @@ +name: toy_graph +file_type: .csv +author: null +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/toy_graph.csv +refs: null +delim: " " +header: None +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 16 +number_of_nodes: 6 +number_of_lines: 16 diff --git a/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml b/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml new file mode 100644 index 00000000000..c6e86bdf334 --- /dev/null +++ b/python/cugraph/cugraph/experimental/datasets/metadata/toy_graph_undirected.yaml @@ -0,0 +1,22 @@ +name: toy_graph_undirected +file_type: .csv +author: null +url: https://raw.githubusercontent.com/rapidsai/cugraph/branch-22.08/datasets/toy_graph_undirected.csv +refs: null +delim: " " +header: None +col_names: + - src + - dst + - wgt +col_types: + - int32 + - int32 + - float32 +has_loop: false +is_directed: false +is_multigraph: false +is_symmetric: true +number_of_edges: 8 +number_of_nodes: 6 +number_of_lines: 8 diff --git a/python/cugraph/cugraph/tests/test_balanced_cut.py b/python/cugraph/cugraph/tests/test_balanced_cut.py index d4148fe4a31..0035ad83bcf 100644 --- a/python/cugraph/cugraph/tests/test_balanced_cut.py +++ b/python/cugraph/cugraph/tests/test_balanced_cut.py @@ -19,7 +19,7 @@ import pandas as pd import cudf import cugraph -from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS def cugraph_call(G, partitions): @@ -59,16 +59,13 @@ def random_call(G, partitions): # Test all combinations of default/managed and pooled/non-pooled allocation -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("partitions", PARTITIONS) def test_edge_cut_clustering(graph_file, partitions): gc.collect() - # Read in the graph and get a cugraph object - cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False) - - G_edge = cugraph.Graph() - G_edge.from_cudf_edgelist(cu_M, source="0", destination="1") + # read_weights_in_sp=True => value column dtype is float32 + G_edge = graph_file.get_graph(ignore_weights=True) # Get the edge_cut score for partitioning versus random assignment cu_vid, cu_score = cugraph_call(G_edge, partitions) @@ -76,21 +73,22 @@ def test_edge_cut_clustering(graph_file, partitions): # Assert that the partitioning has better edge_cut than the random # assignment - print('graph_file = ', graph_file, ', partitions = ', partitions) + dataset_name = graph_file.metadata['name'] + print('graph_file = ', dataset_name, ', partitions = ', partitions) print(cu_score, rand_score) assert cu_score < rand_score -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("partitions", PARTITIONS) def test_edge_cut_clustering_with_edgevals(graph_file, partitions): gc.collect() - # Read in the graph and get a cugraph object - cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False) + G_edge = graph_file.get_graph() - G_edge = cugraph.Graph() - G_edge.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + # read_weights_in_sp=False => value column dtype is float64 + G_edge.edgelist.edgelist_df['weights'] = \ + G_edge.edgelist.edgelist_df['weights'].astype("float64") # Get the edge_cut score for partitioning versus random assignment cu_vid, cu_score = cugraph_call(G_edge, partitions) @@ -124,14 +122,17 @@ def test_digraph_rejected(): cugraph_call(G, 2) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("partitions", PARTITIONS) def test_edge_cut_clustering_with_edgevals_nx(graph_file, partitions): gc.collect() - # Read in the graph and create a NetworkX Graph - # FIXME: replace with utils.generate_nx_graph_from_file() - NM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + # G = cugraph.Graph() + # read_weights_in_sp=True => value column dtype is float32 + G = graph_file.get_graph() + NM = G.to_pandas_edgelist().rename( + columns={'src': '0', 'dst': '1', 'weights': 'weight'}) + G = nx.from_pandas_edgelist( NM, create_using=nx.Graph(), source="0", target="1", edge_attr="weight" diff --git a/python/cugraph/cugraph/tests/test_betweenness_centrality.py b/python/cugraph/cugraph/tests/test_betweenness_centrality.py index 6f879e40017..88efe0d2e22 100755 --- a/python/cugraph/cugraph/tests/test_betweenness_centrality.py +++ b/python/cugraph/cugraph/tests/test_betweenness_centrality.py @@ -23,6 +23,7 @@ import cupy import networkx as nx +from cugraph.experimental.datasets import DATASETS_SMALL, DATASETS_UNRENUMBERED # ============================================================================= @@ -106,8 +107,22 @@ def calc_betweenness_centrality( G = None Gnx = None - G, Gnx = utils.build_cu_and_nx_graphs(graph_file, directed=directed, - edgevals=edgevals) + if edgevals: + edge_attr = "weight" + else: + edge_attr = None + + G = graph_file.get_graph( + create_using=cugraph.Graph( + directed=directed), ignore_weights=not edgevals) + + M = G.to_pandas_edgelist().rename( + columns={'src': '0', 'dst': '1', 'weights': 'weight'}) + + Gnx = nx.from_pandas_edgelist( + M, source="0", target="1", edge_attr=edge_attr, + create_using=(nx.DiGraph() if directed else nx.Graph()) + ) assert G is not None and Gnx is not None if multi_gpu_batch: @@ -284,7 +299,7 @@ def compare_scores(sorted_df, first_key, second_key, epsilon=DEFAULT_EPSILON): # ============================================================================= # Tests # ============================================================================= -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @@ -318,7 +333,7 @@ def test_betweenness_centrality( compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", [None]) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @@ -362,7 +377,7 @@ def test_betweenness_centrality_k_full( # the function operating the comparison inside is first proceeding # to a random sampling over the number of vertices (thus direct offsets) # in the graph structure instead of actual vertices identifiers -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) +@pytest.mark.parametrize("graph_file", DATASETS_UNRENUMBERED) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @@ -400,7 +415,7 @@ def test_betweenness_centrality_fixed_sample( compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @@ -440,7 +455,7 @@ def test_betweenness_centrality_weight_except( compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @@ -477,6 +492,7 @@ def test_betweenness_invalid_dtype( compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") +# FIXME: update the datasets API to return Nx graph @pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) diff --git a/python/cugraph/cugraph/tests/test_bfs.py b/python/cugraph/cugraph/tests/test_bfs.py index 0009d4b5250..dee71a5dd8f 100644 --- a/python/cugraph/cugraph/tests/test_bfs.py +++ b/python/cugraph/cugraph/tests/test_bfs.py @@ -29,6 +29,7 @@ from scipy.sparse import coo_matrix as sp_coo_matrix from scipy.sparse import csr_matrix as sp_csr_matrix from scipy.sparse import csc_matrix as sp_csc_matrix +from cugraph.experimental import datasets # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -256,15 +257,20 @@ def _compare_bfs(cugraph_df, nx_distances, source): assert invalid_predecessor_error == 0, "There are invalid predecessors" -def get_nx_graph_and_params(dataset, directed): +def get_cu_graph_nx_graph_and_params(dataset, directed): """ Helper for fixtures returning a Nx graph obj and params. """ - return (dataset, directed, - utils.generate_nx_graph_from_file(dataset, directed)) + # create graph + G = dataset.get_graph(create_using=cugraph.Graph(directed=directed)) + dataset_path = dataset.get_path() + return (G, dataset_path, directed, + utils.generate_nx_graph_from_file(dataset_path, directed)) -def get_nx_results_and_params(seed, depth_limit, dataset, directed, Gnx): + +def get_cu_graph_nx_results_and_params( + seed, depth_limit, G, dataset, directed, Gnx): """ Helper for fixtures returning Nx results and params. """ @@ -274,7 +280,7 @@ def get_nx_results_and_params(seed, depth_limit, dataset, directed, Gnx): nx_values = nx.single_source_shortest_path_length(Gnx, start_vertex, cutoff=depth_limit) - return (dataset, directed, nx_values, start_vertex, depth_limit) + return (G, dataset, directed, nx_values, start_vertex, depth_limit) # ============================================================================= @@ -282,8 +288,8 @@ def get_nx_results_and_params(seed, depth_limit, dataset, directed, Gnx): # ============================================================================= SEEDS = [pytest.param(s) for s in SUBSET_SEED_OPTIONS] DIRECTED = [pytest.param(d) for d in DIRECTED_GRAPH_OPTIONS] -DATASETS = [pytest.param(d) for d in utils.DATASETS] -DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] +DATASETS = [pytest.param(d) for d in datasets.DATASETS] +DATASETS_SMALL = [pytest.param(d) for d in datasets.DATASETS_SMALL] DEPTH_LIMIT = [pytest.param(d) for d in DEPTH_LIMITS] # Call genFixtureParamsProduct() to caluculate the cartesian product of @@ -320,17 +326,17 @@ def get_nx_results_and_params(seed, depth_limit, dataset, directed, Gnx): # directed graph is being used, and the Nx graph object. @pytest.fixture(scope="module", params=graph_fixture_params) def dataset_nx_graph(request): - return get_nx_graph_and_params(*request.param) + return get_cu_graph_nx_graph_and_params(*request.param) @pytest.fixture(scope="module", params=small_graph_fixture_params) def small_dataset_nx_graph(request): - return get_nx_graph_and_params(*request.param) + return get_cu_graph_nx_graph_and_params(*request.param) @pytest.fixture(scope="module", params=single_small_graph_fixture_params) def single_small_dataset_nx_graph(request): - return get_nx_graph_and_params(*request.param) + return get_cu_graph_nx_graph_and_params(*request.param) # Fixtures that result in a test-per (dataset_nx_graph combinations X algo_test @@ -340,14 +346,15 @@ def single_small_dataset_nx_graph(request): # used. @pytest.fixture(scope="module", params=algo_test_fixture_params) def dataset_nxresults_startvertex_spc(dataset_nx_graph, request): - return get_nx_results_and_params(*request.param, *dataset_nx_graph) + return get_cu_graph_nx_results_and_params( + *request.param, *dataset_nx_graph) @pytest.fixture(scope="module", params=single_algo_test_fixture_params) def single_dataset_nxresults_startvertex_spc(single_small_dataset_nx_graph, request): - return get_nx_results_and_params(*request.param, - *single_small_dataset_nx_graph) + return get_cu_graph_nx_results_and_params(*request.param, + *single_small_dataset_nx_graph) @pytest.fixture(scope="module") @@ -377,7 +384,7 @@ def test_bfs(gpubenchmark, dataset_nxresults_startvertex_spc, """ Test BFS traversal on random source with distance and predecessors """ - (dataset, directed, nx_values, start_vertex, depth_limit) = \ + (G, dataset, directed, nx_values, start_vertex, depth_limit) = \ dataset_nxresults_startvertex_spc # special case: ensure cugraph and Nx Graph types are DiGraphs if @@ -390,7 +397,10 @@ def test_bfs(gpubenchmark, dataset_nxresults_startvertex_spc, elif cugraph_input_type is nx.Graph: cugraph_input_type = nx.DiGraph - G_or_matrix = utils.create_obj_from_csv(dataset, cugraph_input_type) + if not isinstance(cugraph_input_type, (cugraph.Graph, cugraph.DiGraph)): + G_or_matrix = utils.create_obj_from_csv(dataset, cugraph_input_type) + else: + G_or_matrix = G compare_bfs( gpubenchmark, @@ -411,37 +421,25 @@ def test_bfs_nonnative_inputs(gpubenchmark, def test_bfs_invalid_start(gpubenchmark, dataset_nxresults_startvertex_spc, cugraph_input_type): - (dataset, directed, nx_values, start_vertex, depth_limit) = \ + (G, dataset, directed, nx_values, start_vertex, depth_limit) = \ dataset_nxresults_startvertex_spc - # renumber the dataset so that start vertex is no longer a valid vertex - el = cudf.read_csv( - dataset, - sep=' ', - names=['src', 'tar', 'w'], - dtype=['int32', 'int32', 'float32'] - ).dropna() - newval = max(el.src.max(), el.tar.max()) + 1 - el.src = el.src.replace(start_vertex, newval) - el.tar = el.tar.replace(start_vertex, newval) - G = cugraph.from_cudf_edgelist( - el, - source='src', - destination='tar', - edge_attr='w', - renumber=True - ) + el = G.view_edge_list() + + newval = max(el.src.max(), el.dst.max()) + 1 + start_vertex = newval with pytest.raises(ValueError): cugraph.bfs(G, start_vertex, depth_limit=depth_limit) def test_scipy_api_compat(): - graph_file = utils.DATASETS[0] + graph_file = datasets.DATASETS[0] + dataset_path = graph_file.get_path() + + input_cugraph_graph = graph_file.get_graph(ignore_weights=True) - input_cugraph_graph = utils.create_obj_from_csv(graph_file, cugraph.Graph, - edgevals=True) - input_coo_matrix = utils.create_obj_from_csv(graph_file, cp_coo_matrix, + input_coo_matrix = utils.create_obj_from_csv(dataset_path, cp_coo_matrix, edgevals=True) # Ensure scipy-only options are rejected for cugraph inputs with pytest.raises(TypeError): diff --git a/python/cugraph/cugraph/tests/test_connectivity.py b/python/cugraph/cugraph/tests/test_connectivity.py index 6d31c39c447..eed215398ba 100644 --- a/python/cugraph/cugraph/tests/test_connectivity.py +++ b/python/cugraph/cugraph/tests/test_connectivity.py @@ -24,6 +24,7 @@ from scipy.sparse import coo_matrix as sp_coo_matrix from scipy.sparse import csr_matrix as sp_csr_matrix from scipy.sparse import csc_matrix as sp_csc_matrix +from cugraph.experimental.datasets import DATASETS, STRONGDATASETS import cudf import cugraph @@ -70,7 +71,9 @@ def setup_function(): # Helper functions # ============================================================================= def networkx_weak_call(graph_file): - M = utils.read_csv_for_nx(graph_file) + G = graph_file.get_graph() + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) Gnx = nx.from_pandas_edgelist( M, source="0", target="1", create_using=nx.DiGraph() ) @@ -85,12 +88,14 @@ def networkx_weak_call(graph_file): nx_n_components = len(nx_labels) lst_nx_components = sorted(nx_labels, key=len, reverse=True) - return (graph_file, nx_labels, nx_n_components, + return (G, dataset_path, nx_labels, nx_n_components, lst_nx_components, "weak") def networkx_strong_call(graph_file): - M = utils.read_csv_for_nx(graph_file) + G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) Gnx = nx.from_pandas_edgelist( M, source="0", target="1", create_using=nx.DiGraph() ) @@ -104,7 +109,7 @@ def networkx_strong_call(graph_file): nx_n_components = len(nx_labels) lst_nx_components = sorted(nx_labels, key=len, reverse=True) - return (graph_file, nx_labels, nx_n_components, + return (G, dataset_path, nx_labels, nx_n_components, lst_nx_components, "strong") @@ -194,7 +199,7 @@ def which_cluster_idx(_cluster, _find_vertex): return idx -def assert_scipy_api_compat(graph_file, api_type): +def assert_scipy_api_compat(G, dataset_path, api_type): """ Ensure cugraph.scc() and cugraph.connected_components() can be used as drop-in replacements for scipy.connected_components(): @@ -237,9 +242,8 @@ def assert_scipy_api_compat(graph_file, api_type): wrong_connection = {"strong": "weak", "weak": "strong"}[api_type] - input_cugraph_graph = utils.create_obj_from_csv(graph_file, cugraph.Graph, - edgevals=True) - input_coo_matrix = utils.create_obj_from_csv(graph_file, cp_coo_matrix, + input_cugraph_graph = G + input_coo_matrix = utils.create_obj_from_csv(dataset_path, cp_coo_matrix, edgevals=True) # Ensure scipy-only options are rejected for cugraph inputs @@ -266,22 +270,22 @@ def assert_scipy_api_compat(graph_file, api_type): # ============================================================================= # Pytest fixtures # ============================================================================= -@pytest.fixture(scope="module", params=utils.DATASETS) +@pytest.fixture(scope="module", params=DATASETS) def dataset_nxresults_weak(request): return networkx_weak_call(request.param) -@pytest.fixture(scope="module", params=[utils.DATASETS[0]]) +@pytest.fixture(scope="module", params=[DATASETS[0]]) def single_dataset_nxresults_weak(request): return networkx_weak_call(request.param) -@pytest.fixture(scope="module", params=utils.STRONGDATASETS) +@pytest.fixture(scope="module", params=STRONGDATASETS) def dataset_nxresults_strong(request): return networkx_strong_call(request.param) -@pytest.fixture(scope="module", params=[utils.STRONGDATASETS[0]]) +@pytest.fixture(scope="module", params=[STRONGDATASETS[0]]) def single_dataset_nxresults_strong(request): return networkx_strong_call(request.param) @@ -291,12 +295,15 @@ def single_dataset_nxresults_strong(request): # ============================================================================= @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES) def test_weak_cc(gpubenchmark, dataset_nxresults_weak, cugraph_input_type): - (graph_file, netx_labels, + (G, dataset_path, netx_labels, nx_n_components, lst_nx_components, api_type) = dataset_nxresults_weak - input_G_or_matrix = utils.create_obj_from_csv(graph_file, - cugraph_input_type, - edgevals=True) + if not isinstance(cugraph_input_type, (cugraph.Graph, cugraph.DiGraph)): + input_G_or_matrix = utils.create_obj_from_csv(dataset_path, + cugraph_input_type, + edgevals=True) + else: + input_G_or_matrix = G cugraph_labels = cugraph_call(gpubenchmark, cugraph.weakly_connected_components, input_G_or_matrix) @@ -344,12 +351,15 @@ def test_strong_cc(gpubenchmark, dataset_nxresults_strong, # NetX returns a list of components, each component being a # collection (set{}) of vertex indices - (graph_file, netx_labels, + (G, dataset_path, netx_labels, nx_n_components, lst_nx_components, api_type) = dataset_nxresults_strong - input_G_or_matrix = utils.create_obj_from_csv(graph_file, - cugraph_input_type, - edgevals=True) + if not isinstance(cugraph_input_type, (cugraph.Graph, cugraph.DiGraph)): + input_G_or_matrix = utils.create_obj_from_csv(dataset_path, + cugraph_input_type, + edgevals=True) + else: + input_G_or_matrix = G cugraph_labels = cugraph_call(gpubenchmark, cugraph.strongly_connected_components, input_G_or_matrix) @@ -396,25 +406,27 @@ def test_strong_cc_nonnative_inputs(gpubenchmark, def test_scipy_api_compat_weak(single_dataset_nxresults_weak): - (graph_file, _, _, _, api_type) = single_dataset_nxresults_weak - assert_scipy_api_compat(graph_file, api_type) + (G, dataset_path, _, _, _, api_type) = single_dataset_nxresults_weak + assert_scipy_api_compat(G, dataset_path, api_type) def test_scipy_api_compat_strong(single_dataset_nxresults_strong): - (graph_file, _, _, _, api_type) = single_dataset_nxresults_strong - assert_scipy_api_compat(graph_file, api_type) + (G, dataset_path, _, _, _, api_type) = single_dataset_nxresults_strong + assert_scipy_api_compat(G, dataset_path, api_type) @pytest.mark.parametrize("connection_type", ["strong", "weak"]) def test_scipy_api_compat(connection_type): if connection_type == "strong": - graph_file = utils.STRONGDATASETS[0] + graph_file = STRONGDATASETS[0] else: - graph_file = utils.DATASETS[0] + graph_file = DATASETS[0] + + input_cugraph_graph = graph_file.get_graph() + + dataset_path = graph_file.get_path() - input_cugraph_graph = utils.create_obj_from_csv(graph_file, cugraph.Graph, - edgevals=True) - input_coo_matrix = utils.create_obj_from_csv(graph_file, cp_coo_matrix, + input_coo_matrix = utils.create_obj_from_csv(dataset_path, cp_coo_matrix, edgevals=True) # connection is the only API that is accepted with cugraph objs diff --git a/python/cugraph/cugraph/tests/test_core_number.py b/python/cugraph/cugraph/tests/test_core_number.py index 49b2f76664e..954d6975a7e 100644 --- a/python/cugraph/cugraph/tests/test_core_number.py +++ b/python/cugraph/cugraph/tests/test_core_number.py @@ -19,6 +19,7 @@ import cugraph import networkx as nx from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS_UNDIRECTED # ============================================================================= @@ -31,7 +32,7 @@ def setup_function(): # ============================================================================= # Pytest fixtures # ============================================================================= -datasets = utils.DATASETS_UNDIRECTED +datasets = DATASETS_UNDIRECTED degree_type = ["incoming", "outgoing"] fixture_params = utils.genFixtureParamsProduct((datasets, "graph_file"), @@ -48,10 +49,9 @@ def input_combo(request): parameters = dict( zip(("graph_file", "degree_type"), request.param)) - input_data_path = parameters["graph_file"] - - G = utils.generate_cugraph_graph_from_file( - input_data_path, directed=False, edgevals=True) + graph_file = parameters["graph_file"] + G = graph_file.get_graph() + input_data_path = graph_file.get_path() Gnx = utils.generate_nx_graph_from_file( input_data_path, directed=False, edgevals=True) diff --git a/python/cugraph/cugraph/tests/test_dataset.py b/python/cugraph/cugraph/tests/test_dataset.py index 9d9078af9d1..e814d65266d 100644 --- a/python/cugraph/cugraph/tests/test_dataset.py +++ b/python/cugraph/cugraph/tests/test_dataset.py @@ -161,13 +161,6 @@ def test_get_path(dataset, datasets): tmpd.cleanup() -# Path is None until a dataset initializes its edgelist -@pytest.mark.parametrize("dataset", ALL_DATASETS) -def test_get_path_raises(dataset): - with pytest.raises(RuntimeError): - dataset.get_path() - - @pytest.mark.parametrize("dataset", ALL_DATASETS_WGT) def test_weights(dataset, datasets): datasets.set_download_dir(dataset_path) diff --git a/python/cugraph/cugraph/tests/test_degree_centrality.py b/python/cugraph/cugraph/tests/test_degree_centrality.py index f2475a244f3..b1ed4d21c2e 100644 --- a/python/cugraph/cugraph/tests/test_degree_centrality.py +++ b/python/cugraph/cugraph/tests/test_degree_centrality.py @@ -18,6 +18,7 @@ import cudf import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS_UNDIRECTED import networkx as nx @@ -35,10 +36,10 @@ def topKVertices(degree, col, k): return top["vertex"] -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_degree_centrality_nx(graph_file): - NM = utils.read_csv_for_nx(graph_file) - + dataset_path = graph_file.get_path() + NM = utils.read_csv_for_nx(dataset_path) Gnx = nx.from_pandas_edgelist( NM, create_using=nx.DiGraph(), source="0", target="1", ) @@ -66,9 +67,10 @@ def test_degree_centrality_nx(graph_file): assert err < (0.1 * len(ck)) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_degree_centrality_multi_column(graph_file): - cu_M = utils.read_csv_file(graph_file) + dataset_path = graph_file.get_path() + cu_M = utils.read_csv_file(dataset_path) cu_M.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True) cu_M['src_1'] = cu_M['src_0'] + 1000 cu_M['dst_1'] = cu_M['dst_0'] + 1000 diff --git a/python/cugraph/cugraph/tests/test_ecg.py b/python/cugraph/cugraph/tests/test_ecg.py index c7ec0d51489..3bc3b0d3266 100644 --- a/python/cugraph/cugraph/tests/test_ecg.py +++ b/python/cugraph/cugraph/tests/test_ecg.py @@ -18,6 +18,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import karate, dolphins, netscience from pathlib import PurePath @@ -44,10 +45,7 @@ def golden_call(graph_file): return 0.9279554486274719 -DATASETS = [ - PurePath(utils.RAPIDS_DATASET_ROOT_DIR) / f - for f in ["karate.csv", "dolphins.csv", "netscience.csv"] -] +DATASETS = [karate, dolphins, netscience] MIN_WEIGHTS = [0.05, 0.10, 0.15] @@ -61,13 +59,16 @@ def test_ecg_clustering(graph_file, min_weight, ensemble_size): gc.collect() # Read in the graph and get a cugraph object - cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + + G = graph_file.get_graph() + dataset_path = graph_file.get_path() + # read_weights_in_sp=False => value column dtype is float64 + G.edgelist.edgelist_df['weights'] = \ + G.edgelist.edgelist_df['weights'].astype("float64") # Get the modularity score for partitioning versus random assignment cu_score, num_parts = cugraph_call(G, min_weight, ensemble_size) - golden_score = golden_call(graph_file) + golden_score = golden_call(dataset_path) # Assert that the partitioning has better modularity than the random # assignment @@ -79,9 +80,9 @@ def test_ecg_clustering(graph_file, min_weight, ensemble_size): @pytest.mark.parametrize("ensemble_size", ENSEMBLE_SIZES) def test_ecg_clustering_nx(graph_file, min_weight, ensemble_size): gc.collect() - + dataset_path = graph_file.get_path() # Read in the graph and get a NetworkX graph - M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + M = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) G = nx.from_pandas_edgelist( M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() ) diff --git a/python/cugraph/cugraph/tests/test_edge_betweenness_centrality.py b/python/cugraph/cugraph/tests/test_edge_betweenness_centrality.py index 0ae55e76bee..db2cb0686ac 100644 --- a/python/cugraph/cugraph/tests/test_edge_betweenness_centrality.py +++ b/python/cugraph/cugraph/tests/test_edge_betweenness_centrality.py @@ -17,6 +17,8 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import ( + DATASETS_SMALL, DATASETS_UNRENUMBERED) import random import numpy as np import cupy @@ -122,8 +124,14 @@ def calc_edge_betweenness_centrality( """ G = None Gnx = None - G, Gnx = utils.build_cu_and_nx_graphs(graph_file, directed=directed, - edgevals=edgevals) + dataset_path = graph_file.get_path() + Gnx = utils.generate_nx_graph_from_file( + dataset_path, directed=directed, edgevals=edgevals) + + G = graph_file.get_graph( + create_using=cugraph.Graph( + directed=directed), ignore_weights=not edgevals) + assert G is not None and Gnx is not None if multi_gpu_batch: G.enable_batch() @@ -303,7 +311,7 @@ def generate_upper_triangle(dataframe): return dataframe -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @@ -334,7 +342,7 @@ def test_edge_betweenness_centrality( compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", [None]) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @@ -375,7 +383,7 @@ def test_edge_betweenness_centrality_k_full( # the function operating the comparison inside is first proceeding # to a random sampling over the number of vertices (thus direct offsets) # in the graph structure instead of actual vertices identifiers -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNRENUMBERED) +@pytest.mark.parametrize("graph_file", DATASETS_UNRENUMBERED) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @@ -411,7 +419,7 @@ def test_edge_betweenness_centrality_fixed_sample( compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @@ -449,7 +457,7 @@ def test_edge_betweenness_centrality_weight_except( compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("normalized", NORMALIZED_OPTIONS) @pytest.mark.parametrize("subset_size", SUBSET_SIZE_OPTIONS) @@ -483,7 +491,7 @@ def test_edge_betweenness_invalid_dtype( compare_scores(sorted_df, first_key="cu_bc", second_key="ref_bc") -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("edgevals", WEIGHTED_GRAPH_OPTIONS) def test_edge_betweenness_centrality_nx( @@ -491,7 +499,8 @@ def test_edge_betweenness_centrality_nx( directed, edgevals ): - Gnx = utils.generate_nx_graph_from_file(graph_file, directed, edgevals) + dataset_path = graph_file.get_path() + Gnx = utils.generate_nx_graph_from_file(dataset_path, directed, edgevals) assert nx.is_directed(Gnx) == directed nx_bc = nx.edge_betweenness_centrality(Gnx) diff --git a/python/cugraph/cugraph/tests/test_egonet.py b/python/cugraph/cugraph/tests/test_egonet.py index ad434d7e393..4513286d1a9 100644 --- a/python/cugraph/cugraph/tests/test_egonet.py +++ b/python/cugraph/cugraph/tests/test_egonet.py @@ -18,6 +18,7 @@ import cudf import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -36,14 +37,15 @@ RADIUS = [1, 2, 3] -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("radius", RADIUS) def test_ego_graph_nx(graph_file, seed, radius): gc.collect() # Nx - df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + df = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) Gnx = nx.from_pandas_edgelist( df, create_using=nx.Graph(), source="0", target="1", edge_attr="weight" ) @@ -55,14 +57,15 @@ def test_ego_graph_nx(graph_file, seed, radius): assert nx.is_isomorphic(ego_nx, ego_cugraph) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("seeds", [[0, 5, 13]]) @pytest.mark.parametrize("radius", [1, 2, 3]) def test_batched_ego_graphs(graph_file, seeds, radius): gc.collect() # Nx - df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + df = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) Gnx = nx.from_pandas_edgelist( df, create_using=nx.Graph(), source="0", target="1", edge_attr="weight" ) @@ -78,13 +81,14 @@ def test_batched_ego_graphs(graph_file, seeds, radius): assert nx.is_isomorphic(ego_nx, ego_cugraph) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("seed", SEEDS) @pytest.mark.parametrize("radius", RADIUS) def test_multi_column_ego_graph(graph_file, seed, radius): gc.collect() - df = utils.read_csv_file(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + df = utils.read_csv_file(dataset_path, read_weights_in_sp=True) df.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True) df['src_1'] = df['src_0'] + 1000 df['dst_1'] = df['dst_0'] + 1000 diff --git a/python/cugraph/cugraph/tests/test_eigenvector_centrality.py b/python/cugraph/cugraph/tests/test_eigenvector_centrality.py index d6c790d8d5c..b637c5499a3 100644 --- a/python/cugraph/cugraph/tests/test_eigenvector_centrality.py +++ b/python/cugraph/cugraph/tests/test_eigenvector_centrality.py @@ -15,14 +15,15 @@ import pytest -import cudf import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import ( + toy_graph, karate, DATASETS_UNDIRECTED, DATASETS) import networkx as nx # This toy graph is used in multiple tests throughout libcugraph_c and pylib. -TOY = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"toy_graph.csv" +TOY = toy_graph # ============================================================================= @@ -39,15 +40,14 @@ def topKVertices(eigen, col, k): def calc_eigenvector(graph_file): - cu_M = utils.read_csv_file(graph_file) - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist( - cu_M, source="0", destination="1", store_transposed=True) + dataset_path = graph_file.get_path() + G = graph_file.get_graph(create_using=cugraph.Graph( + directed=True), ignore_weights=True) k_df = cugraph.eigenvector_centrality(G, max_iter=1000) k_df = k_df.sort_values("vertex").reset_index(drop=True) - NM = utils.read_csv_for_nx(graph_file) + NM = utils.read_csv_for_nx(dataset_path) Gnx = nx.from_pandas_edgelist( NM, create_using=nx.DiGraph(), source="0", target="1" ) @@ -59,7 +59,7 @@ def calc_eigenvector(graph_file): return k_df -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_eigenvector_centrality(graph_file): eigen_scores = calc_eigenvector(graph_file) @@ -69,10 +69,10 @@ def test_eigenvector_centrality(graph_file): assert topKNX.equals(topKCU) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_eigenvector_centrality_nx(graph_file): - - NM = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + NM = utils.read_csv_for_nx(dataset_path) Gnx = nx.from_pandas_edgelist( NM, create_using=nx.DiGraph(), source="0", target="1", @@ -136,11 +136,7 @@ def test_eigenvector_centrality_multi_column(graph_file): @pytest.mark.parametrize("graph_file", [TOY]) def test_eigenvector_centrality_toy(graph_file): # This test is based off of libcugraph_c and pylibcugraph tests - df = cudf.read_csv(graph_file, delimiter=' ', - dtype=['int32', 'int32', 'float32'], header=None) - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist( - df, source='0', destination='1', edge_attr='2', store_transposed=True) + G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) tol = 1e-6 max_iter = 200 @@ -158,14 +154,7 @@ def test_eigenvector_centrality_toy(graph_file): def test_eigenvector_centrality_transposed_false(): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "karate.csv").as_posix() - cu_M = utils.read_csv_file(input_data_path) - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist( - cu_M, source="0", destination="1", edge_attr="2", - legacy_renum_only=True, store_transposed=False) - + G = karate.get_graph(create_using=cugraph.Graph(directed=True)) warning_msg = ("Eigenvector centrality expects the 'store_transposed' " "flag to be set to 'True' for optimal performance during " "the graph creation") diff --git a/python/cugraph/cugraph/tests/test_filter_unreachable.py b/python/cugraph/cugraph/tests/test_filter_unreachable.py index 5f00775f43d..85d2ddd0767 100644 --- a/python/cugraph/cugraph/tests/test_filter_unreachable.py +++ b/python/cugraph/cugraph/tests/test_filter_unreachable.py @@ -17,7 +17,7 @@ import numpy as np import cugraph -from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -26,6 +26,14 @@ # third-party group once this gets fixed. import warnings + +# ============================================================================= +# Pytest Setup / Teardown - called for each test function +# ============================================================================= +def setup_function(): + gc.collect() + + with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=DeprecationWarning) import networkx as nx @@ -36,20 +44,15 @@ SOURCES = [1] -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("source", SOURCES) def test_filter_unreachable(graph_file, source): - gc.collect() - - cu_M = utils.read_csv_file(graph_file) + G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) + cu_M = G.view_edge_list() print("sources size = " + str(len(cu_M))) print("destinations size = " + str(len(cu_M))) - # cugraph Pagerank Call - G = cugraph.DiGraph() - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - print("cugraph Solving... ") t1 = time.time() diff --git a/python/cugraph/cugraph/tests/test_force_atlas2.py b/python/cugraph/cugraph/tests/test_force_atlas2.py index dec99be17fe..0ca91d26768 100644 --- a/python/cugraph/cugraph/tests/test_force_atlas2.py +++ b/python/cugraph/cugraph/tests/test_force_atlas2.py @@ -17,10 +17,10 @@ import cudf import cugraph from cugraph.internals import GraphBasedDimRedCallback -from cugraph.testing import utils from sklearn.manifold import trustworthiness import scipy.io -from pathlib import PurePath +from cugraph.experimental.datasets import ( + karate, polbooks, dolphins, netscience) # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -29,17 +29,17 @@ # relocated in the third-party group once this gets fixed. -def cugraph_call(cu_M, max_iter, pos_list, outbound_attraction_distribution, +def cugraph_call(cu_M, max_iter, pos_list, + outbound_attraction_distribution, lin_log_mode, prevent_overlapping, edge_weight_influence, jitter_tolerance, barnes_hut_theta, barnes_hut_optimize, scaling_ratio, strong_gravity_mode, gravity, callback=None): G = cugraph.Graph() G.from_cudf_edgelist( - cu_M, source="0", destination="1", edge_attr="2", renumber=False + cu_M, source="src", destination="dst", edge_attr="wgt", renumber=False ) - # cugraph Force Atlas 2 Call t1 = time.time() pos = cugraph.force_atlas2( G, @@ -62,11 +62,10 @@ def cugraph_call(cu_M, max_iter, pos_list, outbound_attraction_distribution, DATASETS = [ - (PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/f,)+(d,) for (f, d) in [ - ("karate.csv", 0.70), - ("polbooks.csv", 0.75), - ("dolphins.csv", 0.66), - ("netscience.csv", 0.66)] + (karate, 0.70), + (polbooks, 0.75), + (dolphins, 0.66), + (netscience, 0.66) ] @@ -96,7 +95,8 @@ def on_train_end(self, positions): @pytest.mark.parametrize('barnes_hut_optimize', BARNES_HUT_OPTIMIZE) def test_force_atlas2(graph_file, score, max_iter, barnes_hut_optimize): - cu_M = utils.read_csv_file(graph_file) + cu_M = graph_file.get_edgelist() + dataset_path = graph_file.get_path() test_callback = TestCallback() cu_pos = cugraph_call(cu_M, max_iter=max_iter, @@ -124,7 +124,7 @@ def test_force_atlas2(graph_file, score, max_iter, iterations on a given graph. """ - matrix_file = graph_file.with_suffix(".mtx") + matrix_file = dataset_path.with_suffix(".mtx") M = scipy.io.mmread(matrix_file) M = M.todense() cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) @@ -146,7 +146,8 @@ def test_force_atlas2(graph_file, score, max_iter, @pytest.mark.parametrize('barnes_hut_optimize', BARNES_HUT_OPTIMIZE) def test_force_atlas2_multi_column_pos_list(graph_file, score, max_iter, barnes_hut_optimize): - cu_M = utils.read_csv_file(graph_file) + cu_M = graph_file.get_edgelist() + dataset_path = graph_file.get_path() test_callback = TestCallback() pos = cugraph_call(cu_M, max_iter=max_iter, @@ -197,7 +198,7 @@ def test_force_atlas2_multi_column_pos_list(graph_file, score, max_iter, callback=test_callback) cu_pos = cu_pos.sort_values('0_vertex') - matrix_file = graph_file.with_suffix(".mtx") + matrix_file = dataset_path.with_suffix(".mtx") M = scipy.io.mmread(matrix_file) M = M.todense() cu_trust = trustworthiness(M, cu_pos[["x", "y"]].to_pandas()) diff --git a/python/cugraph/cugraph/tests/test_graph_store.py b/python/cugraph/cugraph/tests/test_graph_store.py index 6514b53fac2..7056fab3c98 100644 --- a/python/cugraph/cugraph/tests/test_graph_store.py +++ b/python/cugraph/cugraph/tests/test_graph_store.py @@ -14,44 +14,33 @@ from collections import defaultdict import pytest import cugraph -from cugraph.testing import utils from cugraph.experimental import PropertyGraph import numpy as np import cudf import cupy as cp from cugraph.gnn import CuGraphStore +from cugraph.experimental.datasets import DATASETS -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_no_graph(graph_file): with pytest.raises(TypeError): gstore = cugraph.gnn.CuGraphStore() gstore.num_edges() -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_using_graph(graph_file): with pytest.raises(ValueError): - - cu_M = utils.read_csv_file(graph_file) - - g = cugraph.Graph() - g.from_cudf_edgelist( - cu_M, source="0", destination="1", edge_attr="2", renumber=True - ) - + g = graph_file.get_graph() cugraph.gnn.CuGraphStore(graph=g) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_using_pgraph(graph_file): - cu_M = utils.read_csv_file(graph_file) - - g = cugraph.Graph(directed=True) - g.from_cudf_edgelist( - cu_M, source="0", destination="1", edge_attr="2", renumber=True - ) - + g = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) + cu_M = graph_file.get_edgelist().rename( + columns={"src": "0", "dst": "1", "wgt": "2"}) pG = PropertyGraph() pG.add_edge_data(cu_M, vertex_col_names=("0", "1"), property_columns=None) @@ -63,11 +52,10 @@ def test_using_pgraph(graph_file): assert g.number_of_vertices() == gstore.num_vertices -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_node_data_pg(graph_file): - - cu_M = utils.read_csv_file(graph_file) - + cu_M = graph_file.get_edgelist().rename( + columns={"src": "0", "dst": "1", "wgt": "2"}) pG = PropertyGraph() gstore = cugraph.gnn.CuGraphStore(graph=pG, backend_lib="cupy") gstore.add_edge_data( @@ -81,16 +69,14 @@ def test_node_data_pg(graph_file): @pytest.mark.skip("Skipping egonet testing for now") -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_egonet(graph_file): from cugraph.community.egonet import batched_ego_graphs - cu_M = utils.read_csv_file(graph_file) - - g = cugraph.Graph(directed=True) - g.from_cudf_edgelist(cu_M, source="0", destination="1", renumber=True) - + g = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) + cu_M = graph_file.get_edgelist().rename( + columns={"src": "0", "dst": "1", "wgt": "2"}) pG = PropertyGraph() gstore = cugraph.gnn.CuGraphStore(graph=pG, backend_lib="cupy") gstore.add_edge_data( @@ -107,15 +93,11 @@ def test_egonet(graph_file): @pytest.mark.skip("Skipping egonet testing for now") -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_workflow(graph_file): # from cugraph.community.egonet import batched_ego_graphs - - cu_M = utils.read_csv_file(graph_file) - - g = cugraph.Graph(directed=True) - g.from_cudf_edgelist(cu_M, source="0", destination="1", renumber=True) - + cu_M = graph_file.get_edgelist().rename( + columns={"src": "0", "dst": "1", "wgt": "2"}) pg = PropertyGraph() gstore = cugraph.gnn.CuGraphStore(graph=pg) gstore.add_edge_data(cu_M, vertex_col_names=("0", "1"), feat_name="feat") @@ -131,13 +113,10 @@ def test_workflow(graph_file): assert len(ego_edge_list) > 0 -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_sample_neighbors(graph_file): - cu_M = utils.read_csv_file(graph_file) - - g = cugraph.Graph(directed=True) - g.from_cudf_edgelist(cu_M, source="0", destination="1", renumber=True) - + cu_M = graph_file.get_edgelist().rename( + columns={"src": "0", "dst": "1", "wgt": "2"}) pg = PropertyGraph() gstore = cugraph.gnn.CuGraphStore(graph=pg) gstore.add_edge_data(cu_M, feat_name="feat", vertex_col_names=("0", "1")) @@ -157,13 +136,10 @@ def test_sample_neighbors(graph_file): assert len(parents_list) > 0 -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_sample_neighbor_neg_one_fanout(graph_file): - cu_M = utils.read_csv_file(graph_file) - - g = cugraph.Graph(directed=True) - g.from_cudf_edgelist(cu_M, source="0", destination="1", renumber=True) - + cu_M = graph_file.get_edgelist().rename( + columns={"src": "0", "dst": "1", "wgt": "2"}) pg = PropertyGraph() gstore = cugraph.gnn.CuGraphStore(graph=pg) gstore.add_edge_data(cu_M, feat_name="edge_k", vertex_col_names=("0", "1")) @@ -178,9 +154,10 @@ def test_sample_neighbor_neg_one_fanout(graph_file): assert len(parents_list) > 0 -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_get_node_storage_graph_file(graph_file): - cu_M = utils.read_csv_file(graph_file) + cu_M = graph_file.get_edgelist().rename( + columns={"src": "0", "dst": "1", "wgt": "2"}) pg = PropertyGraph() gstore = cugraph.gnn.CuGraphStore(graph=pg, backend_lib="cupy") @@ -208,10 +185,10 @@ def test_get_node_storage_graph_file(graph_file): assert ndata.shape[0] > 0 -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_edge_storage_data_graph_file(graph_file): - cu_M = utils.read_csv_file(graph_file) - + cu_M = graph_file.get_edgelist().rename( + columns={"src": "0", "dst": "1", "wgt": "2"}) pg = PropertyGraph() gstore = cugraph.gnn.CuGraphStore(graph=pg, backend_lib="cupy") gstore.add_edge_data(cu_M, vertex_col_names=("0", "1"), feat_name="edge_k") @@ -308,10 +285,6 @@ def create_df_from_dataset(col_n, rows): def get_dataset1_CuGraphStore(): - """ - Fixture which returns an instance of a CuGraphStore with vertex and edge - data added from dataset1, parameterized for different DataFrame types. - """ merchant_df = create_df_from_dataset( dataset1["merchants"][0], dataset1["merchants"][1] ) diff --git a/python/cugraph/cugraph/tests/test_hits.py b/python/cugraph/cugraph/tests/test_hits.py index 3ab41951678..0e3c1d8feb4 100644 --- a/python/cugraph/cugraph/tests/test_hits.py +++ b/python/cugraph/cugraph/tests/test_hits.py @@ -20,6 +20,8 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import ( + DATASETS_UNDIRECTED, email_Eu_core, karate) # ============================================================================= @@ -32,8 +34,7 @@ def setup_function(): # ============================================================================= # Pytest fixtures # ============================================================================= -datasets = utils.DATASETS_UNDIRECTED + \ - [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"email-Eu-core.csv"] +datasets = DATASETS_UNDIRECTED + [email_Eu_core] fixture_params = utils.genFixtureParamsProduct((datasets, "graph_file"), ([50], "max_iter"), ([1.0e-6], "tol"), @@ -61,7 +62,8 @@ def input_expected_output(input_combo): # previously on the same input_combo to save their results for re-use # elsewhere. if "nxResults" not in input_combo: - Gnx = utils.generate_nx_graph_from_file(input_combo["graph_file"], + dataset_path = input_combo["graph_file"].get_path() + Gnx = utils.generate_nx_graph_from_file(dataset_path, directed=True) nxResults = nx.hits(Gnx, input_combo["max_iter"], input_combo["tol"], normalized=True) @@ -78,7 +80,8 @@ def test_nx_hits(benchmark, input_combo): cuGraph HITS tests. This is only in place for generating comparison performance numbers. """ - Gnx = utils.generate_nx_graph_from_file(input_combo["graph_file"], + dataset_path = input_combo["graph_file"].get_path() + Gnx = utils.generate_nx_graph_from_file(dataset_path, directed=True) nxResults = benchmark( nx.hits, @@ -91,8 +94,9 @@ def test_nx_hits(benchmark, input_combo): def test_hits(benchmark, input_expected_output): - G = utils.generate_cugraph_graph_from_file( - input_expected_output["graph_file"]) + graph_file = input_expected_output["graph_file"] + + G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) cugraph_hits = benchmark(cugraph.hits, G, input_expected_output["max_iter"], @@ -107,7 +111,6 @@ def test_hits(benchmark, input_expected_output): cugraph_hits["nx_hubs"] = cudf.Series.from_pandas(pdf[0]) pdf = pd.DataFrame.from_dict(nx_authorities, orient="index").sort_index() cugraph_hits["nx_authorities"] = cudf.Series.from_pandas(pdf[0]) - hubs_diffs1 = cugraph_hits.query('hubs - nx_hubs > 0.00001') hubs_diffs2 = cugraph_hits.query('hubs - nx_hubs < -0.00001') authorities_diffs1 = cugraph_hits.query( @@ -122,14 +125,8 @@ def test_hits(benchmark, input_expected_output): def test_hits_transposed_false(): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "karate.csv").as_posix() - cu_M = utils.read_csv_file(input_data_path) - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist( - cu_M, source="0", destination="1", edge_attr="2", - legacy_renum_only=True, store_transposed=False) + G = karate.get_graph(create_using=cugraph.Graph(directed=True)) warning_msg = ("Pagerank expects the 'store_transposed' " "flag to be set to 'True' for optimal performance during " "the graph creation") diff --git a/python/cugraph/cugraph/tests/test_jaccard.py b/python/cugraph/cugraph/tests/test_jaccard.py index 8881813618c..a6fda5f5af7 100644 --- a/python/cugraph/cugraph/tests/test_jaccard.py +++ b/python/cugraph/cugraph/tests/test_jaccard.py @@ -19,6 +19,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS_UNDIRECTED, netscience # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -69,13 +70,9 @@ def compare_jaccard_two_hop(G, Gnx): assert diff < 1.0e-6 -def cugraph_call(benchmark_callable, cu_M, edgevals=False): +def cugraph_call(benchmark_callable, graph_file, edgevals=False): G = cugraph.Graph() - if edgevals is True: - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - else: - G.from_cudf_edgelist(cu_M, source="0", destination="1") - + G = graph_file.get_graph(ignore_weights=not edgevals) # cugraph Jaccard Call df = benchmark_callable(cugraph.jaccard, G) @@ -125,21 +122,22 @@ def networkx_call(M, benchmark_callable=None): # ============================================================================= # Pytest Fixtures # ============================================================================= -@pytest.fixture(scope="module", params=utils.DATASETS_UNDIRECTED) +@pytest.fixture(scope="module", params=DATASETS_UNDIRECTED) def read_csv(request): """ Read csv file for both networkx and cugraph """ - M = utils.read_csv_for_nx(request.param) - cu_M = utils.read_csv_file(request.param) + graph_file = request.param + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) - return M, cu_M + return M, graph_file def test_jaccard(read_csv, gpubenchmark): - M, cu_M = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, cu_M) + M, graph_file = read_csv + cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file) nx_src, nx_dst, nx_coeff = networkx_call(M) # Calculating mismatch @@ -179,15 +177,12 @@ def test_nx_jaccard_time(read_csv, gpubenchmark): nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) -@pytest.mark.parametrize( - "graph_file", - [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"netscience.csv"] -) +@pytest.mark.parametrize("graph_file", [netscience]) def test_jaccard_edgevals(gpubenchmark, graph_file): - - M = utils.read_csv_for_nx(graph_file) - cu_M = utils.read_csv_file(graph_file) - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, cu_M, edgevals=True) + dataset_path = netscience.get_path() + M = utils.read_csv_for_nx(dataset_path) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, netscience, edgevals=True) nx_src, nx_dst, nx_coeff = networkx_call(M) # Calculating mismatch @@ -205,26 +200,25 @@ def test_jaccard_edgevals(gpubenchmark, graph_file): def test_jaccard_two_hop(read_csv): - M, cu_M = read_csv + M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( M, source="0", target="1", create_using=nx.Graph() ) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1") + G = graph_file.get_graph(ignore_weights=True) compare_jaccard_two_hop(G, Gnx) def test_jaccard_two_hop_edge_vals(read_csv): - M, cu_M = read_csv + M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() ) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + + G = graph_file.get_graph() compare_jaccard_two_hop(G, Gnx) diff --git a/python/cugraph/cugraph/tests/test_k_core.py b/python/cugraph/cugraph/tests/test_k_core.py index 0ac299db85f..a50c73c8a70 100644 --- a/python/cugraph/cugraph/tests/test_k_core.py +++ b/python/cugraph/cugraph/tests/test_k_core.py @@ -17,6 +17,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS_UNDIRECTED # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -36,19 +37,18 @@ def calc_k_cores(graph_file, directed=True): # directed is used to create either a Graph or DiGraph so the returned # cugraph can be compared to nx graph of same type. - cu_M = utils.read_csv_file(graph_file) - NM = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + NM = utils.read_csv_for_nx(dataset_path) + G = graph_file.get_graph(create_using=cugraph.Graph( + directed=directed), ignore_weights=True) if directed: - G = cugraph.DiGraph() Gnx = nx.from_pandas_edgelist( NM, source="0", target="1", create_using=nx.DiGraph() ) else: - G = cugraph.Graph() Gnx = nx.from_pandas_edgelist( NM, source="0", target="1", create_using=nx.Graph() ) - G.from_cudf_edgelist(cu_M, source="0", destination="1") ck = cugraph.k_core(G) nk = nx.k_core(Gnx) return ck, nk @@ -64,7 +64,7 @@ def compare_edges(cg, nxg): return True -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_k_core_Graph(graph_file): gc.collect() @@ -73,11 +73,11 @@ def test_k_core_Graph(graph_file): assert compare_edges(cu_kcore, nx_kcore) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_k_core_Graph_nx(graph_file): gc.collect() - - NM = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + NM = utils.read_csv_for_nx(dataset_path) Gnx = nx.from_pandas_edgelist( NM, source="0", target="1", create_using=nx.Graph() ) @@ -87,11 +87,11 @@ def test_k_core_Graph_nx(graph_file): assert nx.is_isomorphic(nc, cc) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_k_core_corenumber_multicolumn(graph_file): gc.collect() - - cu_M = utils.read_csv_file(graph_file) + dataset_path = graph_file.get_path() + cu_M = utils.read_csv_file(dataset_path) cu_M.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True) cu_M['src_1'] = cu_M['src_0'] + 1000 cu_M['dst_1'] = cu_M['dst_0'] + 1000 diff --git a/python/cugraph/cugraph/tests/test_k_truss_subgraph.py b/python/cugraph/cugraph/tests/test_k_truss_subgraph.py index 4cdba1e62c5..900c63e3fa2 100644 --- a/python/cugraph/cugraph/tests/test_k_truss_subgraph.py +++ b/python/cugraph/cugraph/tests/test_k_truss_subgraph.py @@ -20,6 +20,7 @@ import numpy as np from numba import cuda +from cugraph.experimental.datasets import DATASETS_KTRUSS, karate_asymmetric # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -92,10 +93,9 @@ def test_unsupported_cuda_version(): unsupported env, and not when called in a supported env. """ k = 5 - cu_M = utils.read_csv_file(utils.DATASETS_KTRUSS[0][0]) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + graph_file = DATASETS_KTRUSS[0][0] + G = graph_file.get_graph() if __cuda_version == __unsupported_cuda_version: with pytest.raises(NotImplementedError): cugraph.k_truss(G, k) @@ -121,11 +121,12 @@ def test_ktruss_subgraph_Graph(graph_file, nx_ground_truth): @pytest.mark.skipif((__cuda_version == __unsupported_cuda_version), reason="skipping on unsupported CUDA " f"{__unsupported_cuda_version} environment.") -@pytest.mark.parametrize("graph_file, nx_ground_truth", utils.DATASETS_KTRUSS) +@pytest.mark.parametrize("graph_file, nx_ground_truth", DATASETS_KTRUSS) def test_ktruss_subgraph_Graph_nx(graph_file, nx_ground_truth): k = 5 - M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) G = nx.from_pandas_edgelist( M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() @@ -140,16 +141,9 @@ def test_ktruss_subgraph_Graph_nx(graph_file, nx_ground_truth): reason="skipping on unsupported CUDA " f"{__unsupported_cuda_version} environment.") def test_ktruss_subgraph_directed_Graph(): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "karate-asymmetric.csv").as_posix() k = 5 edgevals = True - cu_M = utils.read_csv_file(input_data_path) - G = cugraph.Graph(directed=True) - if edgevals: - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - else: - G.from_cudf_edgelist(cu_M, source="0", destination="1") - + G = karate_asymmetric.get_graph(create_using=cugraph.Graph( + directed=True), ignore_weights=not edgevals) with pytest.raises(ValueError): cugraph.k_truss(G, k) diff --git a/python/cugraph/cugraph/tests/test_katz_centrality.py b/python/cugraph/cugraph/tests/test_katz_centrality.py index e56b632f676..1fc923c4e9f 100644 --- a/python/cugraph/cugraph/tests/test_katz_centrality.py +++ b/python/cugraph/cugraph/tests/test_katz_centrality.py @@ -18,6 +18,8 @@ import cudf import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import ( + toy_graph_undirected, karate, DATASETS, DATASETS_UNDIRECTED) # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -31,7 +33,7 @@ import networkx as nx # This toy graph is used in multiple tests throughout libcugraph_c and pylib. -TOY = utils.RAPIDS_DATASET_ROOT_DIR_PATH/'toy_graph_undirected.csv' +TOY = toy_graph_undirected # ============================================================================= @@ -48,10 +50,9 @@ def topKVertices(katz, col, k): def calc_katz(graph_file): - cu_M = utils.read_csv_file(graph_file) - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist( - cu_M, source="0", destination="1", store_transposed=True) + G = graph_file.get_graph( + create_using=cugraph.Graph( + directed=True), ignore_weights=True) degree_max = G.degree()['degree'].max() katz_alpha = 1 / (degree_max) @@ -59,7 +60,8 @@ def calc_katz(graph_file): k_df = cugraph.katz_centrality(G, alpha=None, max_iter=1000) k_df = k_df.sort_values("vertex").reset_index(drop=True) - NM = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + NM = utils.read_csv_for_nx(dataset_path) Gnx = nx.from_pandas_edgelist( NM, create_using=nx.DiGraph(), source="0", target="1" ) @@ -70,8 +72,8 @@ def calc_katz(graph_file): return k_df -@pytest.mark.parametrize("graph_file", utils.DATASETS) -def test_katz_centrality_1(graph_file): +@pytest.mark.parametrize("graph_file", DATASETS) +def test_katz_centrality(graph_file): katz_scores = calc_katz(graph_file) topKNX = topKVertices(katz_scores, "nx_katz", 10) @@ -80,9 +82,10 @@ def test_katz_centrality_1(graph_file): assert topKNX.equals(topKCU) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_katz_centrality_nx(graph_file): - NM = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + NM = utils.read_csv_for_nx(dataset_path) Gnx = nx.from_pandas_edgelist( NM, create_using=nx.DiGraph(), source="0", target="1", @@ -110,9 +113,10 @@ def test_katz_centrality_nx(graph_file): assert err < (0.1 * len(ck)) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_katz_centrality_multi_column(graph_file): - cu_M = utils.read_csv_file(graph_file) + dataset_path = graph_file.get_path() + cu_M = utils.read_csv_file(dataset_path) cu_M.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True) cu_M['src_1'] = cu_M['src_0'] + 1000 cu_M['dst_1'] = cu_M['dst_0'] + 1000 @@ -148,12 +152,8 @@ def test_katz_centrality_multi_column(graph_file): @pytest.mark.parametrize("graph_file", [TOY]) def test_katz_centrality_toy(graph_file): # This test is based off of libcugraph_c and pylibcugraph tests - df = cudf.read_csv(graph_file, delimiter=' ', - dtype=['int32', 'int32', 'float32'], header=None) - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist( - df, source='0', destination='1', edge_attr='2', store_transposed=True) - + G = graph_file.get_graph( + create_using=cugraph.Graph(directed=True)) alpha = 0.01 beta = 1.0 tol = 0.000001 @@ -174,13 +174,9 @@ def test_katz_centrality_toy(graph_file): def test_katz_centrality_transposed_false(): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "karate.csv").as_posix() - cu_M = utils.read_csv_file(input_data_path) - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist( - cu_M, source="0", destination="1", edge_attr="2", - legacy_renum_only=True, store_transposed=False) + + G = karate.get_graph( + create_using=cugraph.Graph(directed=True)) warning_msg = ("Katz centrality expects the 'store_transposed' " "flag to be set to 'True' for optimal performance during " diff --git a/python/cugraph/cugraph/tests/test_leiden.py b/python/cugraph/cugraph/tests/test_leiden.py index 950f31ca81c..67bb58a9e7a 100644 --- a/python/cugraph/cugraph/tests/test_leiden.py +++ b/python/cugraph/cugraph/tests/test_leiden.py @@ -19,6 +19,7 @@ import networkx as nx import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS, karate_asymmetric # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -60,18 +61,11 @@ def cugraph_louvain(G): return parts, mod -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_leiden(graph_file): edgevals = True - cu_M = utils.read_csv_file(graph_file) - - G = cugraph.Graph() - if edgevals: - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - else: - G.from_cudf_edgelist(cu_M, source="0", destination="1") - + G = graph_file.get_graph(ignore_weights=not edgevals) leiden_parts, leiden_mod = cugraph_leiden(G) louvain_parts, louvain_mod = cugraph_louvain(G) @@ -79,11 +73,11 @@ def test_leiden(graph_file): assert leiden_mod >= (0.99 * louvain_mod) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_leiden_nx(graph_file): edgevals = True - - NM = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + NM = utils.read_csv_for_nx(dataset_path) if edgevals: G = nx.from_pandas_edgelist( @@ -102,16 +96,11 @@ def test_leiden_nx(graph_file): def test_leiden_directed_graph(): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "karate-asymmetric.csv").as_posix() edgevals = True - cu_M = utils.read_csv_file(input_data_path) - G = cugraph.Graph(directed=True) - if edgevals: - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - else: - G.from_cudf_edgelist(cu_M, source="0", destination="1") + G = karate_asymmetric.get_graph( + create_using=cugraph.Graph( + directed=True), ignore_weights=not edgevals) with pytest.raises(ValueError): parts, mod = cugraph_leiden(G) diff --git a/python/cugraph/cugraph/tests/test_louvain.py b/python/cugraph/cugraph/tests/test_louvain.py index f49b0bdf873..e2b76d8e024 100644 --- a/python/cugraph/cugraph/tests/test_louvain.py +++ b/python/cugraph/cugraph/tests/test_louvain.py @@ -18,6 +18,8 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import ( + DATASETS_UNDIRECTED, karate_asymmetric) # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -48,14 +50,10 @@ def setup_function(): gc.collect() -def cugraph_call(cu_M, edgevals=False, directed=False): - - G = cugraph.Graph(directed=directed) - if edgevals: - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - else: - G.from_cudf_edgelist(cu_M, source="0", destination="1") - +def cugraph_call(graph_file, edgevals=False, directed=False): + G = graph_file.get_graph( + create_using=cugraph.Graph( + directed=directed), ignore_weights=not edgevals) # cugraph Louvain Call t1 = time.time() parts, mod = cugraph.louvain(G) @@ -80,13 +78,11 @@ def networkx_call(M): return parts -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_louvain_with_edgevals(graph_file): - - M = utils.read_csv_for_nx(graph_file) - cu_M = utils.read_csv_file(graph_file) - cu_parts, cu_mod = cugraph_call(cu_M, edgevals=True) - + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) + cu_parts, cu_mod = cugraph_call(graph_file, edgevals=True) nx_parts = networkx_call(M) # Calculating modularity scores for comparison Gnx = nx.from_pandas_edgelist( @@ -107,12 +103,11 @@ def test_louvain_with_edgevals(graph_file): assert abs(cu_mod - cu_mod_nx) < 0.0001 -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_louvain(graph_file): - - M = utils.read_csv_for_nx(graph_file) - cu_M = utils.read_csv_file(graph_file) - cu_parts, cu_mod = cugraph_call(cu_M) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) + cu_parts, cu_mod = cugraph_call(graph_file) nx_parts = networkx_call(M) # Calculating modularity scores for comparison @@ -135,10 +130,5 @@ def test_louvain(graph_file): def test_louvain_directed_graph(): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "karate-asymmetric.csv").as_posix() - - cu_M = utils.read_csv_file(input_data_path) - with pytest.raises(ValueError): - cugraph_call(cu_M, directed=True) + cugraph_call(karate_asymmetric, directed=True) diff --git a/python/cugraph/cugraph/tests/test_maximum_spanning_tree.py b/python/cugraph/cugraph/tests/test_maximum_spanning_tree.py index 4574e897700..9df738564ab 100644 --- a/python/cugraph/cugraph/tests/test_maximum_spanning_tree.py +++ b/python/cugraph/cugraph/tests/test_maximum_spanning_tree.py @@ -21,6 +21,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS_UNDIRECTED_WEIGHTS # Temporarily suppress warnings till networkX fixes deprecation warnings @@ -54,12 +55,14 @@ def _get_param_args(param_name, param_values): [pytest.param(v, id=f"{param_name}={v}") for v in param_values]) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED_WEIGHTS) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED_WEIGHTS) def test_maximum_spanning_tree_nx(graph_file): # cugraph - cuG = utils.read_csv_file(graph_file, read_weights_in_sp=True) - G = cugraph.Graph() - G.from_cudf_edgelist(cuG, source="0", destination="1", edge_attr="2") + G = graph_file.get_graph() + # read_weights_in_sp=False => value column dtype is float64 + G.edgelist.edgelist_df['weights'] = \ + G.edgelist.edgelist_df['weights'].astype("float64") + # Just for getting relevant timing G.view_adj_list() t1 = time.time() @@ -68,7 +71,8 @@ def test_maximum_spanning_tree_nx(graph_file): print("CuGraph time : " + str(t2)) # Nx - df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + df = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) Gnx = nx.from_pandas_edgelist( df, create_using=nx.Graph(), source="0", target="1", edge_attr="weight" ) @@ -80,12 +84,13 @@ def test_maximum_spanning_tree_nx(graph_file): utils.compare_mst(cugraph_mst, mst_nx) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED_WEIGHTS) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED_WEIGHTS) @pytest.mark.parametrize(*_get_param_args("use_adjlist", [True, False])) def test_maximum_spanning_tree_graph_repr_compat(graph_file, use_adjlist): - cuG = utils.read_csv_file(graph_file, read_weights_in_sp=True) - G = cugraph.Graph() - G.from_cudf_edgelist(cuG, source="0", destination="1", edge_attr="2") + G = graph_file.get_graph() + # read_weights_in_sp=False => value column dtype is float64 + G.edgelist.edgelist_df['weights'] = \ + G.edgelist.edgelist_df['weights'].astype("float64") if use_adjlist: G.view_adj_list() cugraph.maximum_spanning_tree(G) diff --git a/python/cugraph/cugraph/tests/test_minimum_spanning_tree.py b/python/cugraph/cugraph/tests/test_minimum_spanning_tree.py index 65e0da39715..ee92821f0d7 100644 --- a/python/cugraph/cugraph/tests/test_minimum_spanning_tree.py +++ b/python/cugraph/cugraph/tests/test_minimum_spanning_tree.py @@ -21,7 +21,7 @@ import cugraph from cugraph.testing import utils - +from cugraph.experimental.datasets import DATASETS_UNDIRECTED_WEIGHTS # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -54,12 +54,12 @@ def _get_param_args(param_name, param_values): [pytest.param(v, id=f"{param_name}={v}") for v in param_values]) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED_WEIGHTS) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED_WEIGHTS) def test_minimum_spanning_tree_nx(graph_file): # cugraph - cuG = utils.read_csv_file(graph_file, read_weights_in_sp=True) - G = cugraph.Graph() - G.from_cudf_edgelist(cuG, source="0", destination="1", edge_attr="2") + G = graph_file.get_graph() + G.edgelist.edgelist_df['weights'] = \ + G.edgelist.edgelist_df['weights'].astype("float64") # Just for getting relevant timing G.view_adj_list() t1 = time.time() @@ -68,7 +68,8 @@ def test_minimum_spanning_tree_nx(graph_file): print("CuGraph time : " + str(t2)) # Nx - df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + df = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) Gnx = nx.from_pandas_edgelist( df, create_using=nx.Graph(), source="0", target="1", edge_attr="weight" ) @@ -80,12 +81,13 @@ def test_minimum_spanning_tree_nx(graph_file): utils.compare_mst(cugraph_mst, mst_nx) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED_WEIGHTS) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED_WEIGHTS) @pytest.mark.parametrize(*_get_param_args("use_adjlist", [True, False])) def test_minimum_spanning_tree_graph_repr_compat(graph_file, use_adjlist): - cuG = utils.read_csv_file(graph_file, read_weights_in_sp=True) - G = cugraph.Graph() - G.from_cudf_edgelist(cuG, source="0", destination="1", edge_attr="2") + G = graph_file.get_graph() + # read_weights_in_sp=False => value column dtype is float64 + G.edgelist.edgelist_df['weights'] = \ + G.edgelist.edgelist_df['weights'].astype("float64") if use_adjlist: G.view_adj_list() cugraph.minimum_spanning_tree(G) diff --git a/python/cugraph/cugraph/tests/test_modularity.py b/python/cugraph/cugraph/tests/test_modularity.py index d3de71fbe4e..a27c6b6073e 100644 --- a/python/cugraph/cugraph/tests/test_modularity.py +++ b/python/cugraph/cugraph/tests/test_modularity.py @@ -20,6 +20,7 @@ import cugraph from cugraph.testing import utils from cugraph.utilities import ensure_cugraph_obj_for_nx +from cugraph.experimental.datasets import DATASETS import networkx as nx @@ -54,15 +55,16 @@ def random_call(G, partitions): PARTITIONS = [2, 4, 8] -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("partitions", PARTITIONS) def test_modularity_clustering(graph_file, partitions): gc.collect() # Read in the graph and get a cugraph object - cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + G = graph_file.get_graph() + # read_weights_in_sp=False => value column dtype is float64 + G.edgelist.edgelist_df['weights'] = \ + G.edgelist.edgelist_df['weights'].astype("float64") # Get the modularity score for partitioning versus random assignment cu_score = cugraph_call(G, partitions) @@ -73,11 +75,12 @@ def test_modularity_clustering(graph_file, partitions): assert cu_score > rand_score -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("partitions", PARTITIONS) def test_modularity_clustering_nx(graph_file, partitions): # Read in the graph and get a cugraph object - csv_data = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + csv_data = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) nxG = nx.from_pandas_edgelist( csv_data, @@ -102,11 +105,12 @@ def test_modularity_clustering_nx(graph_file, partitions): assert cu_score > rand_score -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("partitions", PARTITIONS) def test_modularity_clustering_multi_column(graph_file, partitions): # Read in the graph and get a cugraph object - cu_M = utils.read_csv_file(graph_file, read_weights_in_sp=False) + dataset_path = graph_file.get_path() + cu_M = utils.read_csv_file(dataset_path, read_weights_in_sp=False) cu_M.rename(columns={'0': 'src_0', '1': 'dst_0'}, inplace=True) cu_M['src_1'] = cu_M['src_0'] + 1000 cu_M['dst_1'] = cu_M['dst_0'] + 1000 diff --git a/python/cugraph/cugraph/tests/test_multigraph.py b/python/cugraph/cugraph/tests/test_multigraph.py index 1d6aea58051..4647755b879 100644 --- a/python/cugraph/cugraph/tests/test_multigraph.py +++ b/python/cugraph/cugraph/tests/test_multigraph.py @@ -19,6 +19,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS # ============================================================================= @@ -28,14 +29,12 @@ def setup_function(): gc.collect() -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_multigraph(graph_file): # FIXME: Migrate to new test fixtures for Graph setup once available - cuM = utils.read_csv_file(graph_file) - G = cugraph.MultiDiGraph() - G.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2") - - nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + G = graph_file.get_graph(create_using=cugraph.MultiGraph(directed=True)) + dataset_path = graph_file.get_path() + nxM = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) Gnx = nx.from_pandas_edgelist( nxM, source="0", @@ -61,13 +60,12 @@ def test_multigraph(graph_file): assert nxedges.equals(cuedges[["source", "target", "weight"]]) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_Graph_from_MultiGraph(graph_file): # FIXME: Migrate to new test fixtures for Graph setup once available - cuM = utils.read_csv_file(graph_file) - GM = cugraph.MultiGraph() - GM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2") - nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + GM = graph_file.get_graph(create_using=cugraph.MultiGraph()) + dataset_path = graph_file.get_path() + nxM = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) GnxM = nx.from_pandas_edgelist( nxM, source="0", @@ -79,9 +77,7 @@ def test_Graph_from_MultiGraph(graph_file): G = cugraph.Graph(GM) Gnx = nx.Graph(GnxM) assert Gnx.number_of_edges() == G.number_of_edges() - - GdM = cugraph.MultiDiGraph() - GdM.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2") + GdM = graph_file.get_graph(create_using=cugraph.MultiGraph(directed=True)) GnxdM = nx.from_pandas_edgelist( nxM, source="0", @@ -94,16 +90,15 @@ def test_Graph_from_MultiGraph(graph_file): assert Gnxd.number_of_edges() == Gd.number_of_edges() -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_multigraph_sssp(graph_file): # FIXME: Migrate to new test fixtures for Graph setup once available - cuM = utils.read_csv_file(graph_file) - G = cugraph.MultiDiGraph() - G.from_cudf_edgelist(cuM, source="0", destination="1", edge_attr="2") + G = graph_file.get_graph(create_using=cugraph.MultiGraph(directed=True)) cu_paths = cugraph.sssp(G, 0) max_val = np.finfo(cu_paths["distance"].dtype).max cu_paths = cu_paths[cu_paths["distance"] != max_val] - nxM = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + nxM = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) Gnx = nx.from_pandas_edgelist( nxM, source="0", diff --git a/python/cugraph/cugraph/tests/test_node2vec.py b/python/cugraph/cugraph/tests/test_node2vec.py index 86ddf0454a2..549be42c863 100644 --- a/python/cugraph/cugraph/tests/test_node2vec.py +++ b/python/cugraph/cugraph/tests/test_node2vec.py @@ -19,6 +19,7 @@ from cugraph.testing import utils import cugraph import cudf +from cugraph.experimental.datasets import small_line, karate, DATASETS_SMALL # ============================================================================= @@ -26,8 +27,8 @@ # ============================================================================= DIRECTED_GRAPH_OPTIONS = [False, True] COMPRESSED = [False, True] -LINE = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"small_line.csv" -KARATE = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"karate.csv" +LINE = small_line +KARATE = karate # ============================================================================= @@ -81,8 +82,7 @@ def calc_node2vec(G, def test_node2vec_invalid( graph_file ): - G = utils.generate_cugraph_graph_from_file(graph_file, directed=True, - edgevals=True) + G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) k = random.randint(1, 10) start_vertices = cudf.Series(random.sample(range(G.number_of_vertices()), k), dtype="int32") @@ -121,8 +121,7 @@ def test_node2vec_invalid( @pytest.mark.parametrize(*_get_param_args("graph_file", [LINE])) @pytest.mark.parametrize(*_get_param_args("directed", DIRECTED_GRAPH_OPTIONS)) def test_node2vec_line(graph_file, directed): - G = utils.generate_cugraph_graph_from_file(graph_file, directed=directed, - edgevals=True) + G = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) max_depth = 3 start_vertices = cudf.Series([0, 3, 6], dtype="int32") df, seeds = calc_node2vec( @@ -135,7 +134,7 @@ def test_node2vec_line(graph_file, directed): ) -@pytest.mark.parametrize(*_get_param_args("graph_file", utils.DATASETS_SMALL)) +@pytest.mark.parametrize(*_get_param_args("graph_file", DATASETS_SMALL)) @pytest.mark.parametrize(*_get_param_args("directed", DIRECTED_GRAPH_OPTIONS)) @pytest.mark.parametrize(*_get_param_args("compress", COMPRESSED)) def test_node2vec( @@ -143,7 +142,8 @@ def test_node2vec( directed, compress, ): - cu_M = utils.read_csv_file(graph_file) + dataset_path = graph_file.get_path() + cu_M = utils.read_csv_file(dataset_path) G = cugraph.Graph(directed=directed) @@ -261,7 +261,8 @@ def test_node2vec_renumber_cudf( graph_file, renumber ): - cu_M = cudf.read_csv(graph_file, delimiter=' ', + dataset_path = graph_file.get_path() + cu_M = cudf.read_csv(dataset_path, delimiter=' ', dtype=['int32', 'int32', 'float32'], header=None) G = cugraph.Graph(directed=True) G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2", diff --git a/python/cugraph/cugraph/tests/test_nx_convert.py b/python/cugraph/cugraph/tests/test_nx_convert.py index 9e716c41027..fc417f9229f 100644 --- a/python/cugraph/cugraph/tests/test_nx_convert.py +++ b/python/cugraph/cugraph/tests/test_nx_convert.py @@ -17,6 +17,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS # Temporarily suppress warnings till networkX fixes deprecation warnings @@ -70,11 +71,12 @@ def _compare_graphs(nxG, cuG, has_wt=True): assert cu_df.to_dict() == nx_df.to_dict() -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_networkx_compatibility(graph_file): # test to make sure cuGraph and Nx build similar Graphs # Read in the graph - M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) # create a NetworkX DiGraph nxG = nx.from_pandas_edgelist( @@ -96,10 +98,11 @@ def test_networkx_compatibility(graph_file): _compare_graphs(nxG, cuG) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_nx_convert_undirected(graph_file): # read data and create a Nx Graph - nx_df = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + nx_df = utils.read_csv_for_nx(dataset_path) nxG = nx.from_pandas_edgelist(nx_df, "0", "1", create_using=nx.Graph) assert nx.is_directed(nxG) is False assert nx.is_weighted(nxG) is False @@ -111,10 +114,11 @@ def test_nx_convert_undirected(graph_file): _compare_graphs(nxG, cuG, has_wt=False) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_nx_convert_directed(graph_file): # read data and create a Nx DiGraph - nx_df = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + nx_df = utils.read_csv_for_nx(dataset_path) nxG = nx.from_pandas_edgelist(nx_df, "0", "1", create_using=nx.DiGraph) assert nxG.is_directed() is True @@ -125,10 +129,11 @@ def test_nx_convert_directed(graph_file): _compare_graphs(nxG, cuG, has_wt=False) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_nx_convert_weighted(graph_file): # read data and create a Nx DiGraph - nx_df = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + nx_df = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) nxG = nx.from_pandas_edgelist(nx_df, "0", "1", "weight", create_using=nx.DiGraph) assert nx.is_directed(nxG) is True @@ -141,10 +146,11 @@ def test_nx_convert_weighted(graph_file): _compare_graphs(nxG, cuG, has_wt=True) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_nx_convert_multicol(graph_file): # read data and create a Nx Graph - nx_df = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + nx_df = utils.read_csv_for_nx(dataset_path) G = nx.DiGraph() diff --git a/python/cugraph/cugraph/tests/test_overlap.py b/python/cugraph/cugraph/tests/test_overlap.py index 03a4395d008..bd8dbd1579c 100644 --- a/python/cugraph/cugraph/tests/test_overlap.py +++ b/python/cugraph/cugraph/tests/test_overlap.py @@ -21,6 +21,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS_UNDIRECTED # ============================================================================= @@ -46,13 +47,11 @@ def compare_overlap(cu_coeff, cpu_coeff): assert diff < 1.0e-6 -def cugraph_call(benchmark_callable, cu_M, pairs, edgevals=False): - G = cugraph.DiGraph() +def cugraph_call(benchmark_callable, graph_file, pairs, edgevals=False): # Device data - if edgevals is True: - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - else: - G.from_cudf_edgelist(cu_M, source="0", destination="1") + G = graph_file.get_graph( + create_using=cugraph.Graph( + directed=True), ignore_weights=not edgevals) # cugraph Overlap Call df = benchmark_callable(cugraph.overlap, G, pairs) df = df.sort_values(by=["source", "destination"]) @@ -106,21 +105,20 @@ def cpu_call(M, first, second): # ============================================================================= # Pytest Fixtures # ============================================================================= -@pytest.fixture(scope="module", params=utils.DATASETS_UNDIRECTED) +@pytest.fixture(scope="module", params=DATASETS_UNDIRECTED) def read_csv(request): """ Read csv file for both networkx and cugraph """ - - Mnx = utils.read_csv_for_nx(request.param) + graph_file = request.param + dataset_path = graph_file.get_path() + Mnx = utils.read_csv_for_nx(dataset_path) N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 M = scipy.sparse.csr_matrix( (Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N) ) - cu_M = utils.read_csv_file(request.param) - print("cu_M is \n", cu_M) - return M, cu_M + return M, graph_file @pytest.fixture(scope="module") @@ -128,9 +126,8 @@ def extract_two_hop(read_csv): """ Build graph and extract two hop neighbors """ - G = cugraph.Graph() - _, cu_M = read_csv - G.from_cudf_edgelist(cu_M, source="0", destination="1") + _, graph_file = read_csv + G = graph_file.get_graph(ignore_weights=True) pairs = ( G.get_two_hop_neighbors() .sort_values(["first", "second"]) @@ -142,10 +139,10 @@ def extract_two_hop(read_csv): # Test def test_overlap(gpubenchmark, read_csv, extract_two_hop): - M, cu_M = read_csv + M, graph_file = read_csv pairs = extract_two_hop - cu_coeff = cugraph_call(gpubenchmark, cu_M, pairs) + cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) compare_overlap(cu_coeff, cpu_coeff) @@ -154,19 +151,19 @@ def test_overlap(gpubenchmark, read_csv, extract_two_hop): # Test def test_overlap_edge_vals(gpubenchmark, read_csv, extract_two_hop): - M, cu_M = read_csv + M, graph_file = read_csv pairs = extract_two_hop - cu_coeff = cugraph_call(gpubenchmark, cu_M, pairs, edgevals=True) + cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs, edgevals=True) cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) compare_overlap(cu_coeff, cpu_coeff) -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_overlap_multi_column(graph_file): - - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() cu_M["src_0"] = cudf.Series(M["0"]) diff --git a/python/cugraph/cugraph/tests/test_pagerank.py b/python/cugraph/cugraph/tests/test_pagerank.py index 00439c22f31..d215c57a212 100644 --- a/python/cugraph/cugraph/tests/test_pagerank.py +++ b/python/cugraph/cugraph/tests/test_pagerank.py @@ -20,6 +20,7 @@ import cudf import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS, karate # Temporarily suppress warnings till networkX fixes deprecation warnings @@ -159,7 +160,7 @@ def setup_function(): # -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("max_iter", MAX_ITERATIONS) @pytest.mark.parametrize("tol", TOLERANCE) @pytest.mark.parametrize("alpha", ALPHA) @@ -172,7 +173,8 @@ def test_pagerank( ): # NetworkX PageRank - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) nnz_vtx = np.unique(M[['0', '1']]) Gnx = nx.from_pandas_edgelist( M, source="0", target="1", edge_attr="weight", @@ -191,11 +193,7 @@ def test_pagerank( cu_prsn = cudify(networkx_prsn) # cuGraph PageRank - cu_M = utils.read_csv_file(graph_file) - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist( - cu_M, source="0", destination="1", edge_attr="2", - legacy_renum_only=True, store_transposed=True) + G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) if has_precomputed_vertex_out_weight == 1: df = G.view_edge_list()[["src", "weights"]] @@ -221,7 +219,7 @@ def test_pagerank( assert err < (0.01 * len(cugraph_pr)) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("max_iter", MAX_ITERATIONS) @pytest.mark.parametrize("tol", TOLERANCE) @pytest.mark.parametrize("alpha", ALPHA) @@ -232,7 +230,8 @@ def test_pagerank_nx( ): # NetworkX PageRank - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) nnz_vtx = np.unique(M[['0', '1']]) Gnx = nx.from_pandas_edgelist( M, source="0", target="1", create_using=nx.DiGraph() @@ -268,7 +267,7 @@ def test_pagerank_nx( assert err < (0.01 * len(cugraph_pr)) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("max_iter", MAX_ITERATIONS) @pytest.mark.parametrize("tol", TOLERANCE) @pytest.mark.parametrize("alpha", ALPHA) @@ -281,7 +280,8 @@ def test_pagerank_multi_column( ): # NetworkX PageRank - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) nnz_vtx = np.unique(M[['0', '1']]) Gnx = nx.from_pandas_edgelist( @@ -363,9 +363,8 @@ def test_pagerank_multi_column( def test_pagerank_invalid_personalization_dtype(): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "karate.csv").as_posix() - M = utils.read_csv_for_nx(input_data_path) + dataset_path = karate.get_path() + M = utils.read_csv_for_nx(dataset_path) G = cugraph.Graph(directed=True) cu_M = cudf.DataFrame() cu_M["src"] = cudf.Series(M["0"]) @@ -390,14 +389,7 @@ def test_pagerank_invalid_personalization_dtype(): def test_pagerank_transposed_false(): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "karate.csv").as_posix() - cu_M = utils.read_csv_file(input_data_path) - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist( - cu_M, source="0", destination="1", edge_attr="2", - legacy_renum_only=True, store_transposed=False) - + G = karate.get_graph(create_using=cugraph.Graph(directed=True)) warning_msg = ("Pagerank expects the 'store_transposed' " "flag to be set to 'True' for optimal performance during " "the graph creation") diff --git a/python/cugraph/cugraph/tests/test_property_graph.py b/python/cugraph/cugraph/tests/test_property_graph.py index 3d263d4ab73..5bb81c2b05d 100644 --- a/python/cugraph/cugraph/tests/test_property_graph.py +++ b/python/cugraph/cugraph/tests/test_property_graph.py @@ -19,6 +19,7 @@ import numpy as np import cudf from cudf.testing import assert_frame_equal, assert_series_equal +from cugraph.experimental.datasets import cyber # If the rapids-pytest-benchmark plugin is installed, the "gpubenchmark" # fixture will be available automatically. Check that this fixture is available @@ -320,19 +321,12 @@ def cyber_PropertyGraph(request): from cugraph.experimental import PropertyGraph dataframe_type = request.param[0] - cyber_csv = utils.RAPIDS_DATASET_ROOT_DIR_PATH/"cyber.csv" source_col_name = "srcip" dest_col_name = "dstip" + df = cyber.get_edgelist() if dataframe_type is pd.DataFrame: - read_csv = pd.read_csv - else: - read_csv = cudf.read_csv - df = read_csv(cyber_csv, delimiter=",", - dtype={"idx": "int32", - source_col_name: "str", - dest_col_name: "str"}, - header=0) + df = df.to_pandas() pG = PropertyGraph() pG.add_edge_data(df, (source_col_name, dest_col_name)) diff --git a/python/cugraph/cugraph/tests/test_random_walks.py b/python/cugraph/cugraph/tests/test_random_walks.py index 3deb9a39ad0..a750ddbccbc 100644 --- a/python/cugraph/cugraph/tests/test_random_walks.py +++ b/python/cugraph/cugraph/tests/test_random_walks.py @@ -17,17 +17,16 @@ import pytest from cudf.testing import assert_series_equal -from cugraph.testing import utils import cugraph - +from cugraph.experimental.datasets import DATASETS, DATASETS_SMALL # ============================================================================= # Parameters # ============================================================================= DIRECTED_GRAPH_OPTIONS = [False, True] WEIGHTED_GRAPH_OPTIONS = [False, True] -DATASETS = [pytest.param(d) for d in utils.DATASETS] -DATASETS_SMALL = [pytest.param(d) for d in utils.DATASETS_SMALL] +DATASETS = [pytest.param(d) for d in DATASETS] +DATASETS_SMALL = [pytest.param(d) for d in DATASETS_SMALL] # ============================================================================= @@ -74,8 +73,7 @@ def calc_random_walks(graph_file, sizes: int The path size in case of coalesced paths. """ - G = utils.generate_cugraph_graph_from_file( - graph_file, directed=directed, edgevals=True) + G = graph_file.get_graph(create_using=cugraph.Graph(directed=directed)) assert G is not None k = random.randint(1, 10) @@ -123,7 +121,7 @@ def check_random_walks(path_data, seeds, df_G=None): assert invalid_seeds == 0 -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) @pytest.mark.parametrize("max_depth", [None]) def test_random_walks_invalid_max_dept(graph_file, @@ -137,16 +135,14 @@ def test_random_walks_invalid_max_dept(graph_file, ) -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_random_walks_coalesced( graph_file, directed ): max_depth = random.randint(2, 10) - df_G = utils.read_csv_file(graph_file) - df_G.rename( - columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) + df_G = graph_file.get_edgelist() path_data, seeds = calc_random_walks( graph_file, directed, @@ -164,16 +160,13 @@ def test_random_walks_coalesced( assert df['weight_offsets'].to_numpy().tolist() == w_offsets -@pytest.mark.parametrize("graph_file", utils.DATASETS_SMALL) +@pytest.mark.parametrize("graph_file", DATASETS_SMALL) @pytest.mark.parametrize("directed", DIRECTED_GRAPH_OPTIONS) def test_random_walks_padded( graph_file, directed ): max_depth = random.randint(2, 10) - df_G = utils.read_csv_file(graph_file) - df_G.rename( - columns={"0": "src", "1": "dst", "2": "weight"}, inplace=True) path_data, seeds = calc_random_walks( graph_file, directed, diff --git a/python/cugraph/cugraph/tests/test_renumber.py b/python/cugraph/cugraph/tests/test_renumber.py index f0d37cade38..037bec398d8 100644 --- a/python/cugraph/cugraph/tests/test_renumber.py +++ b/python/cugraph/cugraph/tests/test_renumber.py @@ -22,6 +22,7 @@ from cugraph.structure.number_map import NumberMap from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS def test_renumber_ips(): @@ -195,11 +196,11 @@ def test_renumber_negative_col(): ) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_renumber_files(graph_file): gc.collect() - - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) sources = cudf.Series(M["0"]) destinations = cudf.Series(M["1"]) @@ -233,11 +234,11 @@ def test_renumber_files(graph_file): check_names=False) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_renumber_files_col(graph_file): gc.collect() - - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) sources = cudf.Series(M["0"]) destinations = cudf.Series(M["1"]) @@ -271,11 +272,11 @@ def test_renumber_files_col(graph_file): check_names=False) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_renumber_files_multi_col(graph_file): gc.collect() - - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) sources = cudf.Series(M["0"]) destinations = cudf.Series(M["1"]) diff --git a/python/cugraph/cugraph/tests/test_sorensen.py b/python/cugraph/cugraph/tests/test_sorensen.py index bfae8662409..e8f8ff44961 100644 --- a/python/cugraph/cugraph/tests/test_sorensen.py +++ b/python/cugraph/cugraph/tests/test_sorensen.py @@ -19,6 +19,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS_UNDIRECTED, netscience # Temporarily suppress warnings till networkX fixes deprecation warnings # (Using or importing the ABCs from 'collections' instead of from @@ -74,12 +75,8 @@ def compare_sorensen_two_hop(G, Gnx): assert diff < 1.0e-6 -def cugraph_call(benchamrk_callable, cu_M, edgevals=False): - G = cugraph.Graph() - if edgevals is True: - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") - else: - G.from_cudf_edgelist(cu_M, source="0", destination="1") +def cugraph_call(benchamrk_callable, graph_file, edgevals=False): + G = graph_file.get_graph(ignore_weights=not edgevals) # cugraph sorensen Call df = benchamrk_callable(cugraph.sorensen, G) @@ -133,21 +130,22 @@ def networkx_call(M, benchmark_callable=None): # ============================================================================= # Pytest Fixtures # ============================================================================= -@pytest.fixture(scope="module", params=utils.DATASETS_UNDIRECTED) +@pytest.fixture(scope="module", params=DATASETS_UNDIRECTED) def read_csv(request): """ Read csv file for both networkx and cugraph """ - M = utils.read_csv_for_nx(request.param) - cu_M = utils.read_csv_file(request.param) + graph_file = request.param + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) - return M, cu_M + return M, graph_file def test_sorensen(gpubenchmark, read_csv): - M, cu_M = read_csv - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, cu_M) + M, graph_file = read_csv + cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, graph_file) nx_src, nx_dst, nx_coeff = networkx_call(M) # Calculating mismatch @@ -169,15 +167,13 @@ def test_nx_sorensen_time(gpubenchmark, read_csv): nx_src, nx_dst, nx_coeff = networkx_call(M, gpubenchmark) -@pytest.mark.parametrize( - "graph_file", - [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"netscience.csv"] -) +@pytest.mark.parametrize("graph_file", [netscience]) def test_sorensen_edgevals(gpubenchmark, graph_file): + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) - M = utils.read_csv_for_nx(graph_file) - cu_M = utils.read_csv_file(graph_file) - cu_src, cu_dst, cu_coeff = cugraph_call(gpubenchmark, cu_M, edgevals=True) + cu_src, cu_dst, cu_coeff = cugraph_call( + gpubenchmark, graph_file, edgevals=True) nx_src, nx_dst, nx_coeff = networkx_call(M) # Calculating mismatch @@ -195,26 +191,24 @@ def test_sorensen_edgevals(gpubenchmark, graph_file): def test_sorensen_two_hop(read_csv): - M, cu_M = read_csv + M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( M, source="0", target="1", create_using=nx.Graph() ) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1") + G = graph_file.get_graph(ignore_weights=True) compare_sorensen_two_hop(G, Gnx) def test_sorensen_two_hop_edge_vals(read_csv): - M, cu_M = read_csv + M, graph_file = read_csv Gnx = nx.from_pandas_edgelist( M, source="0", target="1", edge_attr="weight", create_using=nx.Graph() ) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1", edge_attr="2") + G = graph_file.get_graph() compare_sorensen_two_hop(G, Gnx) diff --git a/python/cugraph/cugraph/tests/test_sssp.py b/python/cugraph/cugraph/tests/test_sssp.py index ac6c7662855..78973ea0a3e 100644 --- a/python/cugraph/cugraph/tests/test_sssp.py +++ b/python/cugraph/cugraph/tests/test_sssp.py @@ -28,6 +28,7 @@ import cudf import cugraph from cugraph.testing import utils +from cugraph.experimental import datasets # Temporarily suppress warnings till networkX fixes deprecation warnings @@ -135,7 +136,8 @@ def cugraph_call(gpu_benchmark_callable, input_G_or_matrix, def networkx_call(graph_file, source, edgevals=False): - M = utils.read_csv_for_nx(graph_file, read_weights_in_sp=True) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path, read_weights_in_sp=True) # Directed NetworkX graph edge_attr = "weight" if edgevals else None Gnx = nx.from_pandas_edgelist( @@ -153,10 +155,14 @@ def networkx_call(graph_file, source, edgevals=False): else: nx_paths = nx.single_source_dijkstra_path_length(Gnx, source) + G = graph_file.get_graph( + create_using=cugraph.Graph( + directed=True), ignore_weights=not edgevals) + t2 = time.time() - t1 print("NX Time : " + str(t2)) - return (graph_file, source, nx_paths, Gnx) + return (G, dataset_path, source, nx_paths, Gnx) # ============================================================================= @@ -168,7 +174,7 @@ def networkx_call(graph_file, source, edgevals=False): # not do this automatically (unlike multiply-parameterized tests). The 2nd # item in the tuple is a label for the param value used when displaying the # full test name. -DATASETS = [pytest.param(d) for d in utils.DATASETS] +DATASETS = [pytest.param(d) for d in datasets.DATASETS] SOURCES = [pytest.param(1)] fixture_params = utils.genFixtureParamsProduct((DATASETS, "ds"), (SOURCES, "src")) @@ -206,10 +212,14 @@ def single_dataset_source_nxresults_weighted(request): @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES) def test_sssp(gpubenchmark, dataset_source_nxresults, cugraph_input_type): # Extract the params generated from the fixture - (graph_file, source, nx_paths, Gnx) = dataset_source_nxresults + (G, dataset_path, source, nx_paths, Gnx) = dataset_source_nxresults + + if not isinstance(cugraph_input_type, (cugraph.Graph, cugraph.DiGraph)): + input_G_or_matrix = utils.create_obj_from_csv( + dataset_path, cugraph_input_type) + else: + input_G_or_matrix = G - input_G_or_matrix = utils.create_obj_from_csv(graph_file, - cugraph_input_type) cu_paths, max_val = cugraph_call(gpubenchmark, input_G_or_matrix, source) # Calculating mismatch @@ -235,23 +245,11 @@ def test_sssp(gpubenchmark, dataset_source_nxresults, cugraph_input_type): @pytest.mark.parametrize("cugraph_input_type", utils.CUGRAPH_DIR_INPUT_TYPES) def test_sssp_invalid_start(gpubenchmark, dataset_source_nxresults, cugraph_input_type): - (graph_file, source, nx_paths, Gnx) = dataset_source_nxresults - el = cudf.read_csv( - graph_file, - sep=' ', - dtype=['int32', 'int32', 'float32'], - names=['src', 'tar', 'w'] - ).dropna() - newval = max(el.src.max(), el.tar.max()) + 1 - el.src = el.src.replace(source, newval) - el.tar = el.tar.replace(source, newval) - G = cugraph.from_cudf_edgelist( - el, - source='src', - destination='tar', - edge_attr='w', - renumber=True - ) + (G, _, source, nx_paths, Gnx) = dataset_source_nxresults + el = G.view_edge_list() + + newval = max(el.src.max(), el.dst.max()) + 1 + source = newval with pytest.raises(ValueError): cugraph_call(gpubenchmark, G, source) @@ -271,11 +269,9 @@ def test_sssp_nonnative_inputs(gpubenchmark, def test_sssp_edgevals(gpubenchmark, dataset_source_nxresults_weighted, cugraph_input_type): # Extract the params generated from the fixture - (graph_file, source, nx_paths, Gnx) = dataset_source_nxresults_weighted + (G, _, source, nx_paths, Gnx) = dataset_source_nxresults_weighted + input_G_or_matrix = G - input_G_or_matrix = utils.create_obj_from_csv(graph_file, - cugraph_input_type, - edgevals=True) cu_paths, max_val = cugraph_call(gpubenchmark, input_G_or_matrix, source, edgevals=True) @@ -312,11 +308,12 @@ def test_sssp_edgevals_nonnative_inputs( cugraph_input_type) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) @pytest.mark.parametrize("source", SOURCES) def test_sssp_data_type_conversion(graph_file, source): - M = utils.read_csv_for_nx(graph_file) - cu_M = utils.read_csv_file(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) + cu_M = utils.read_csv_file(dataset_path) # cugraph call with int32 weights cu_M["2"] = cu_M["2"].astype(np.int32) @@ -367,11 +364,10 @@ def test_sssp_data_type_conversion(graph_file, source): def test_scipy_api_compat(): - graph_file = utils.DATASETS[0] - - input_cugraph_graph = utils.create_obj_from_csv(graph_file, cugraph.Graph, - edgevals=True) - input_coo_matrix = utils.create_obj_from_csv(graph_file, cp_coo_matrix, + graph_file = datasets.DATASETS[0] + dataset_path = graph_file.get_path() + input_cugraph_graph = graph_file.get_graph() + input_coo_matrix = utils.create_obj_from_csv(dataset_path, cp_coo_matrix, edgevals=True) # Ensure scipy-only options are rejected for cugraph inputs diff --git a/python/cugraph/cugraph/tests/test_subgraph_extraction.py b/python/cugraph/cugraph/tests/test_subgraph_extraction.py index cd44030083d..cee65378e0b 100644 --- a/python/cugraph/cugraph/tests/test_subgraph_extraction.py +++ b/python/cugraph/cugraph/tests/test_subgraph_extraction.py @@ -20,6 +20,7 @@ import cudf import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS, karate ############################################################################### @@ -68,9 +69,10 @@ def nx_call(M, verts, directed=True): ############################################################################### -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_subgraph_extraction_DiGraph(graph_file): - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) verts = np.zeros(3, dtype=np.int32) verts[0] = 0 verts[1] = 1 @@ -80,9 +82,10 @@ def test_subgraph_extraction_DiGraph(graph_file): assert compare_edges(cu_sg, nx_sg) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_subgraph_extraction_Graph(graph_file): - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) verts = np.zeros(3, dtype=np.int32) verts[0] = 0 verts[1] = 1 @@ -92,15 +95,15 @@ def test_subgraph_extraction_Graph(graph_file): assert compare_edges(cu_sg, nx_sg) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_subgraph_extraction_Graph_nx(graph_file): directed = False verts = np.zeros(3, dtype=np.int32) verts[0] = 0 verts[1] = 1 verts[2] = 17 - - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) if directed: G = nx.from_pandas_edgelist( @@ -120,9 +123,10 @@ def test_subgraph_extraction_Graph_nx(graph_file): assert nx_sub.has_edge(u, v) -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_subgraph_extraction_multi_column(graph_file): - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() cu_M["src_0"] = cudf.Series(M["0"]) @@ -160,13 +164,11 @@ def test_subgraph_extraction_graph_not_renumbered(): """ Ensure subgraph() works with a Graph that has not been renumbered """ - graph_file = utils.RAPIDS_DATASET_ROOT_DIR_PATH / "karate.csv" - gdf = cudf.read_csv(graph_file, delimiter=" ", - dtype=["int32", "int32", "float32"], header=None) + gdf = karate.get_edgelist() verts = np.array([0, 1, 2], dtype=np.int32) sverts = cudf.Series(verts) G = cugraph.Graph() - G.from_cudf_edgelist(gdf, source="0", destination="1", renumber=False) + G.from_cudf_edgelist(gdf, source="src", destination="dst", renumber=False) Sg = cugraph.subgraph(G, sverts) assert Sg.number_of_vertices() == 3 diff --git a/python/cugraph/cugraph/tests/test_symmetrize.py b/python/cugraph/cugraph/tests/test_symmetrize.py index ecfee359a0b..1e6b631cbc9 100644 --- a/python/cugraph/cugraph/tests/test_symmetrize.py +++ b/python/cugraph/cugraph/tests/test_symmetrize.py @@ -18,7 +18,7 @@ import pandas as pd import cudf import cugraph -from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS def test_version(): @@ -148,21 +148,21 @@ def compare(src1, dst1, val1, src2, dst2, val2): @pytest.mark.skip("debugging") -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_symmetrize_unweighted(graph_file): gc.collect() - cu_M = utils.read_csv_file(graph_file) - - sym_sources, sym_destinations = cugraph.symmetrize(cu_M["0"], cu_M["1"]) + cu_M = graph_file.get_edgelist() + sym_sources, sym_destinations = cugraph.symmetrize( + cu_M["src"], cu_M["dst"]) # # Check to see if all pairs in sources/destinations exist in # both directions # compare( - cu_M["0"], - cu_M["1"], + cu_M["src"], + cu_M["dst"], None, sym_sources, sym_destinations, @@ -171,14 +171,13 @@ def test_symmetrize_unweighted(graph_file): @pytest.mark.skip("debugging") -@pytest.mark.parametrize("graph_file", utils.DATASETS) +@pytest.mark.parametrize("graph_file", DATASETS) def test_symmetrize_weighted(graph_file): gc.collect() - - cu_M = utils.read_csv_file(graph_file) + cu_M = graph_file.get_edgelist() sym_src, sym_dst, sym_w = cugraph.symmetrize( - cu_M["0"], cu_M["1"], cu_M["2"] + cu_M["src"], cu_M["dst"], cu_M["wgt"] ) - compare(cu_M["0"], cu_M["1"], cu_M["2"], sym_src, sym_dst, sym_w) + compare(cu_M["src"], cu_M["dst"], cu_M["wgt"], sym_src, sym_dst, sym_w) diff --git a/python/cugraph/cugraph/tests/test_triangle_count.py b/python/cugraph/cugraph/tests/test_triangle_count.py index da30474ff59..82c9092da7b 100644 --- a/python/cugraph/cugraph/tests/test_triangle_count.py +++ b/python/cugraph/cugraph/tests/test_triangle_count.py @@ -19,6 +19,8 @@ import cudf import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import ( + DATASETS_UNDIRECTED, karate_asymmetric) # Temporarily suppress warnings till networkX fixes deprecation warnings @@ -43,7 +45,7 @@ def setup_function(): # ============================================================================= # Pytest fixtures # ============================================================================= -datasets = utils.DATASETS_UNDIRECTED +datasets = DATASETS_UNDIRECTED fixture_params = utils.genFixtureParamsProduct((datasets, "graph_file"), ([True, False], "edgevals"), ([True, False], "start_list"), @@ -59,11 +61,11 @@ def input_combo(request): parameters = dict( zip(("graph_file", "edgevals", "start_list"), request.param)) - input_data_path = parameters["graph_file"] + graph_file = parameters["graph_file"] + input_data_path = graph_file.get_path() edgevals = parameters["edgevals"] - G = utils.generate_cugraph_graph_from_file( - input_data_path, directed=False, edgevals=edgevals) + G = graph_file.get_graph(ignore_weights=not edgevals) Gnx = utils.generate_nx_graph_from_file( input_data_path, directed=False, edgevals=edgevals) @@ -111,13 +113,10 @@ def test_triangles_int64(input_combo): Gnx = input_combo["Gnx"] count_legacy_32 = cugraph.triangle_count(Gnx) - graph_files = input_combo["graph_file"] - gdf = cudf.read_csv(graph_files, - delimiter=' ', - dtype=['int64', 'int64', 'float32'], - header=None) - G = cugraph.Graph() - G.from_cudf_edgelist(gdf, source='0', destination='1', edge_attr='2') + graph_file = input_combo["graph_file"] + G = graph_file.get_graph() + G.edgelist.edgelist_df = G.edgelist.edgelist_df.astype( + {"src": "int64", "dst": "int64"}) count_exp_64 = cugraph.triangle_count(G).sort_values( "vertex").reset_index(drop=True).rename(columns={ @@ -135,14 +134,9 @@ def test_triangles_no_weights(input_combo): "vertex").reset_index(drop=True).rename(columns={ "counts": "exp_cugraph_counts"}) - graph_files = input_combo["graph_file"] - gdf = cudf.read_csv(graph_files, - delimiter=' ', - dtype=['int32', 'int32', 'float64'], - header=None) - G = cugraph.Graph() - gdf = gdf.drop('2', axis=1) - G.from_cudf_edgelist(gdf, source='0', destination='1') + graph_file = input_combo["graph_file"] + G = graph_file.get_graph(ignore_weights=True) + assert (G.is_weighted() is False) triangle_count = cugraph.triangle_count(G).sort_values( "vertex").reset_index(drop=True).rename(columns={ @@ -153,8 +147,7 @@ def test_triangles_no_weights(input_combo): def test_triangles_directed_graph(): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "karate-asymmetric.csv").as_posix() + input_data_path = karate_asymmetric.get_path() M = utils.read_csv_for_nx(input_data_path) G = cugraph.Graph(directed=True) cu_M = cudf.DataFrame() diff --git a/python/cugraph/cugraph/tests/test_uniform_neighbor_sample.py b/python/cugraph/cugraph/tests/test_uniform_neighbor_sample.py index 5c326ef2087..207bff213c3 100644 --- a/python/cugraph/cugraph/tests/test_uniform_neighbor_sample.py +++ b/python/cugraph/cugraph/tests/test_uniform_neighbor_sample.py @@ -16,6 +16,8 @@ import cudf from cugraph.testing import utils from cugraph import uniform_neighbor_sample +from cugraph.experimental.datasets import ( + DATASETS_UNDIRECTED, email_Eu_core, small_tree) import random @@ -31,8 +33,7 @@ def setup_function(): # ============================================================================= IS_DIRECTED = [True, False] -datasets = utils.DATASETS_UNDIRECTED + \ - [utils.RAPIDS_DATASET_ROOT_DIR_PATH/"email-Eu-core.csv"] +datasets = DATASETS_UNDIRECTED + [email_Eu_core] fixture_params = utils.genFixtureParamsProduct( (datasets, "graph_file"), @@ -55,7 +56,7 @@ def input_combo(request): indices_type = parameters["indices_type"] - input_data_path = parameters["graph_file"] + input_data_path = parameters["graph_file"].get_path() directed = parameters["directed"] df = cudf.read_csv( @@ -170,8 +171,7 @@ def test_uniform_neighbor_sample_simple(input_combo): @pytest.mark.parametrize("directed", IS_DIRECTED) def test_uniform_neighbor_sample_tree(directed): - input_data_path = (utils.RAPIDS_DATASET_ROOT_DIR_PATH / - "small_tree.csv").as_posix() + input_data_path = small_tree.get_path() df = cudf.read_csv( input_data_path, diff --git a/python/cugraph/cugraph/tests/test_utils.py b/python/cugraph/cugraph/tests/test_utils.py index 03a1f7c103a..c055cd91b22 100644 --- a/python/cugraph/cugraph/tests/test_utils.py +++ b/python/cugraph/cugraph/tests/test_utils.py @@ -12,26 +12,20 @@ # limitations under the License. import gc -from pathlib import PurePath import pytest import cugraph import cudf from cugraph.testing import utils +from cugraph.experimental.datasets import karate import numpy as np def test_bfs_paths(): with pytest.raises(ValueError) as ErrorMsg: gc.collect() - - graph_file = PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/"karate.csv" - - cu_M = utils.read_csv_file(graph_file) - - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2') + G = karate.get_graph() # run BFS starting at vertex 17 df = cugraph.bfs(G, 16) @@ -50,13 +44,7 @@ def test_bfs_paths(): def test_bfs_paths_array(): with pytest.raises(ValueError) as ErrorMsg: gc.collect() - - graph_file = PurePath(utils.RAPIDS_DATASET_ROOT_DIR)/"karate.csv" - - cu_M = utils.read_csv_file(graph_file) - - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source='0', destination='1', edge_attr='2') + G = karate.get_graph() # run BFS starting at vertex 17 df = cugraph.bfs(G, 16) diff --git a/python/cugraph/cugraph/tests/test_wjaccard.py b/python/cugraph/cugraph/tests/test_wjaccard.py index 23a778591af..5834c1b96c1 100644 --- a/python/cugraph/cugraph/tests/test_wjaccard.py +++ b/python/cugraph/cugraph/tests/test_wjaccard.py @@ -21,6 +21,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS_UNDIRECTED # Temporarily suppress warnings till networkX fixes deprecation warnings @@ -44,17 +45,18 @@ def setup_function(): gc.collect() -def cugraph_call(benchmark_callable, cu_M): +def cugraph_call(benchmark_callable, graph_file): # Device data + cu_M = graph_file.get_edgelist() weight_arr = cudf.Series( - np.ones(max(cu_M["0"].max(), cu_M["1"].max()) + 1, dtype=np.float32) + np.ones( + max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) ) weights = cudf.DataFrame() weights['vertex'] = np.arange(len(weight_arr), dtype=np.int32) weights['weight'] = weight_arr - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1") + G = graph_file.get_graph(ignore_weights=True) # cugraph Jaccard Call df = benchmark_callable(cugraph.jaccard_w, G, weights) @@ -98,22 +100,23 @@ def networkx_call(M, benchmark_callable=None): # ============================================================================= # Pytest Fixtures # ============================================================================= -@pytest.fixture(scope="module", params=utils.DATASETS_UNDIRECTED) +@pytest.fixture(scope="module", params=DATASETS_UNDIRECTED) def read_csv(request): """ Read csv file for both networkx and cugraph """ - M = utils.read_csv_for_nx(request.param) - cu_M = utils.read_csv_file(request.param) + graph_file = request.param + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) - return M, cu_M + return M, graph_file def test_wjaccard(gpubenchmark, read_csv): - M, cu_M = read_csv + M, graph_file = read_csv - cu_coeff = cugraph_call(gpubenchmark, cu_M) + cu_coeff = cugraph_call(gpubenchmark, graph_file) nx_coeff = networkx_call(M) for i in range(len(cu_coeff)): diff = abs(nx_coeff[i] - cu_coeff[i]) @@ -128,9 +131,9 @@ def test_nx_wjaccard_time(gpubenchmark, read_csv): def test_wjaccard_multi_column_weights(gpubenchmark, read_csv): - M, cu_M = read_csv + M, graph_file = read_csv - cu_coeff = cugraph_call(gpubenchmark, cu_M) + cu_coeff = cugraph_call(gpubenchmark, graph_file) nx_coeff = networkx_call(M) for i in range(len(cu_coeff)): diff = abs(nx_coeff[i] - cu_coeff[i]) diff --git a/python/cugraph/cugraph/tests/test_woverlap.py b/python/cugraph/cugraph/tests/test_woverlap.py index 419d60a1cd6..e95a2e5ad88 100644 --- a/python/cugraph/cugraph/tests/test_woverlap.py +++ b/python/cugraph/cugraph/tests/test_woverlap.py @@ -18,6 +18,7 @@ import numpy as np import cudf from cudf.testing import assert_series_equal +from cugraph.experimental.datasets import DATASETS_UNDIRECTED import cugraph from cugraph.testing import utils @@ -30,17 +31,18 @@ def setup_function(): gc.collect() -def cugraph_call(benchmark_callable, cu_M, pairs): +def cugraph_call(benchmark_callable, graph_file, pairs): # Device data + cu_M = graph_file.get_edgelist() weights_arr = cudf.Series( - np.ones(max(cu_M["0"].max(), cu_M["1"].max()) + 1, dtype=np.float32) + np.ones( + max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) ) weights = cudf.DataFrame() weights['vertex'] = np.arange(len(weights_arr), dtype=np.int32) weights['weight'] = weights_arr - G = cugraph.Graph(directed=True) - G.from_cudf_edgelist(cu_M, source="0", destination="1") + G = graph_file.get_graph(create_using=cugraph.Graph(directed=True)) # cugraph Overlap Call df = benchmark_callable(cugraph.overlap_w, G, weights, pairs) @@ -92,25 +94,23 @@ def cpu_call(M, first, second): return result -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_woverlap(gpubenchmark, graph_file): - - Mnx = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + Mnx = utils.read_csv_for_nx(dataset_path) N = max(max(Mnx["0"]), max(Mnx["1"])) + 1 M = scipy.sparse.csr_matrix( (Mnx.weight, (Mnx["0"], Mnx["1"])), shape=(N, N) ) - cu_M = utils.read_csv_file(graph_file) - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1") + G = graph_file.get_graph(ignore_weights=True) pairs = ( G.get_two_hop_neighbors() .sort_values(["first", "second"]) .reset_index(drop=True) ) - cu_coeff = cugraph_call(gpubenchmark, cu_M, pairs) + cu_coeff = cugraph_call(gpubenchmark, graph_file, pairs) cpu_coeff = cpu_call(M, pairs["first"], pairs["second"]) assert len(cu_coeff) == len(cpu_coeff) for i in range(len(cu_coeff)): @@ -123,10 +123,10 @@ def test_woverlap(gpubenchmark, graph_file): assert diff < 1.0e-6 -@pytest.mark.parametrize("graph_file", utils.DATASETS_UNDIRECTED) +@pytest.mark.parametrize("graph_file", DATASETS_UNDIRECTED) def test_woverlap_multi_column(graph_file): - - M = utils.read_csv_for_nx(graph_file) + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) cu_M = cudf.DataFrame() cu_M["src_0"] = cudf.Series(M["0"]) diff --git a/python/cugraph/cugraph/tests/test_wsorensen.py b/python/cugraph/cugraph/tests/test_wsorensen.py index 45bd3662a88..ad2b6d05639 100644 --- a/python/cugraph/cugraph/tests/test_wsorensen.py +++ b/python/cugraph/cugraph/tests/test_wsorensen.py @@ -21,6 +21,7 @@ import cugraph from cugraph.testing import utils +from cugraph.experimental.datasets import DATASETS_UNDIRECTED # Temporarily suppress warnings till networkX fixes deprecation warnings @@ -44,17 +45,18 @@ def setup_function(): gc.collect() -def cugraph_call(benchmark_callable, cu_M): +def cugraph_call(benchmark_callable, graph_file): # Device data + cu_M = graph_file.get_edgelist() weight_arr = cudf.Series( - np.ones(max(cu_M["0"].max(), cu_M["1"].max()) + 1, dtype=np.float32) + np.ones( + max(cu_M["src"].max(), cu_M["dst"].max()) + 1, dtype=np.float32) ) weights = cudf.DataFrame() weights['vertex'] = np.arange(len(weight_arr), dtype=np.int32) weights['weight'] = weight_arr - G = cugraph.Graph() - G.from_cudf_edgelist(cu_M, source="0", destination="1") + G = graph_file.get_graph(ignore_weights=True) # cugraph Sorensen Call df = benchmark_callable(cugraph.sorensen_w, G, weights) @@ -102,22 +104,23 @@ def networkx_call(M, benchmark_callable=None): # ============================================================================= # Pytest Fixtures # ============================================================================= -@pytest.fixture(scope="module", params=utils.DATASETS_UNDIRECTED) +@pytest.fixture(scope="module", params=DATASETS_UNDIRECTED) def read_csv(request): """ Read csv file for both networkx and cugraph """ - M = utils.read_csv_for_nx(request.param) - cu_M = utils.read_csv_file(request.param) + graph_file = request.param + dataset_path = graph_file.get_path() + M = utils.read_csv_for_nx(dataset_path) - return M, cu_M + return M, graph_file def test_wsorensen(gpubenchmark, read_csv): - M, cu_M = read_csv + M, graph_file = read_csv - cu_coeff = cugraph_call(gpubenchmark, cu_M) + cu_coeff = cugraph_call(gpubenchmark, graph_file) nx_coeff = networkx_call(M) for i in range(len(cu_coeff)): diff = abs(nx_coeff[i] - cu_coeff[i])