From e8eddcedeef9f46deab561b4dc62a975a7d7c60e Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Wed, 7 Sep 2022 10:55:45 +0100 Subject: [PATCH 01/11] Remove sk-network dependencies. Implement distance with scipy --- kglab/networks.py | 20 +++++++++++--------- kglab/subg.py | 4 ++-- requirements.txt | 3 +-- tests/test_networks.py | 21 ++++++++++++++------- 4 files changed, 28 insertions(+), 20 deletions(-) diff --git a/kglab/networks.py b/kglab/networks.py index 328a5d7..0633610 100644 --- a/kglab/networks.py +++ b/kglab/networks.py @@ -6,7 +6,8 @@ see license https://github.com/DerwenAI/kglab#license-and-copyright """ -import sknetwork as skn +import networkx as nx +from scipy.spatial.distance import pdist, squareform class NetAnalysisMixin: """ @@ -17,7 +18,7 @@ def get_distances(self, adj_mtx): Compute distances according to an adjacency matrix. """ self.check_attributes() - return skn.path.get_distances(adj_mtx) + return squareform(pdist(adj_mtx, metric='euclidean')) def get_shortest_path(self, adj_matx, src, dst): """ @@ -34,10 +35,11 @@ def get_shortest_path(self, adj_matx, src, dst): list of int: a path of indices """ self.check_attributes() - return skn.path.get_shortest_path(adj_matx, src, dst) - - -# number of nodes, number of edges -# density -# triangles -# reciprocity \ No newline at end of file + return nx.shortest_path(self.nx_graph, source=src, target=dst) + + def describe(self): + # number of nodes, number of edges + # density + # triangles + # reciprocity + raise NotImplementedError() \ No newline at end of file diff --git a/kglab/subg.py b/kglab/subg.py index 9c9a8e4..d61a9dc 100644 --- a/kglab/subg.py +++ b/kglab/subg.py @@ -25,7 +25,7 @@ import cugraph # type: ignore # pylint: disable=W0611 -class Subgraph(AlgebraMixin, NetAnalysisMixin): +class Subgraph: """ Base class for projection of an RDF graph into an *algebraic object* such as a *vector*, *matrix*, or *tensor* representation, to support integration with non-RDF graph libraries. @@ -164,7 +164,7 @@ def check_attributes(self): self.nx_graph = self.build_nx_graph(nx.DiGraph()) -class SubgraphMatrix (Subgraph): +class SubgraphMatrix (Subgraph, AlgebraMixin, NetAnalysisMixin): """ Projection of a RDF graph to a [*matrix*](https://mathworld.wolfram.com/AdjacencyMatrix.html) representation. Typical use cases include integration with non-RDF graph libraries for *graph algorithms*. diff --git a/requirements.txt b/requirements.txt index aa8dd09..20c7aa8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,7 +8,7 @@ gcsfs >= 2022.2 icecream >= 2.1 morph-kgc >= 2.0.0 networkx >= 2.7 -numpy == 1.23.0 +numpy >= 1.23.0 owlrl >= 6.0.2 oxrdflib >= 0.3.1 pandas >= 1.4 @@ -21,7 +21,6 @@ pyvis >= 0.1.9 rdflib >= 6.1 requests >= 2.27 scikit-learn == 1.1.2 -scikit-network == 0.27.1 scipy >= 1.8.0 statsmodels >= 0.13 tqdm >= 4.63 diff --git a/tests/test_networks.py b/tests/test_networks.py index 2b83139..011970c 100644 --- a/tests/test_networks.py +++ b/tests/test_networks.py @@ -1,5 +1,6 @@ import pytest import numpy as np +from networkx.exception import NetworkXNoPath import kglab from kglab.subg import SubgraphMatrix, Subgraph @@ -47,22 +48,28 @@ def get_items(s): def test_distances_mtx(kg_test_data): subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) - dist = subgraph.get_distances(subgraph.to_scipy_sparse()) + dist = subgraph.get_distances(subgraph.to_adjacency()) np.testing.assert_allclose( dist[0,:6], - [0, 1, 1, 1, 1, 1] + [0, 2.44948974, 2.44948974, 2.44948974, 2.44948974, 2.44948974] ) def test_shortest_path(kg_test_data): subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) - dist = subgraph.get_shortest_path(subgraph.to_scipy_sparse(), 2, 6) - assert dist == [] + try: + subgraph.get_shortest_path(subgraph.to_adjacency(), 2, 6) + assert False + except NetworkXNoPath: + pass - dist = subgraph.get_shortest_path(subgraph.to_scipy_sparse(), 0, 2) + dist = subgraph.get_shortest_path(subgraph.to_adjacency(), 0, 2) assert dist == [0, 2] - dist = subgraph.get_shortest_path(subgraph.to_scipy_sparse(), 0, 7) - assert dist == [] + try: + dist = subgraph.get_shortest_path(subgraph.to_adjacency(), 0, 7) + assert False + except NetworkXNoPath: + pass \ No newline at end of file From c2a03140272700dc30c12594be123dec5e49a46b Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Wed, 7 Sep 2022 11:21:27 +0100 Subject: [PATCH 02/11] update docstring --- examples/graph_algebra/gla_ex0_0.ipynb | 6 +++--- kglab/networks.py | 9 +++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/examples/graph_algebra/gla_ex0_0.ipynb b/examples/graph_algebra/gla_ex0_0.ipynb index 3a46c2d..a7fd661 100644 --- a/examples/graph_algebra/gla_ex0_0.ipynb +++ b/examples/graph_algebra/gla_ex0_0.ipynb @@ -23,7 +23,7 @@ { "data": { "text/plain": [ - "" + "" ] }, "execution_count": 1, @@ -138,7 +138,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -151,7 +151,7 @@ " [0., 0., 0., 1., 0.]])" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } diff --git a/kglab/networks.py b/kglab/networks.py index 0633610..0f2eb3d 100644 --- a/kglab/networks.py +++ b/kglab/networks.py @@ -16,16 +16,17 @@ class NetAnalysisMixin: def get_distances(self, adj_mtx): """ Compute distances according to an adjacency matrix. + + adj_mtx: +numpy.array: square matrix of distances. """ self.check_attributes() return squareform(pdist(adj_mtx, metric='euclidean')) - def get_shortest_path(self, adj_matx, src, dst): + def get_shortest_path(self, src, dst): """ -Return shortest path from sources to destinations according to an djacency matrix. +Return shortest path from sources to destinations. - adj_mtx: -numpy.array: adjacency matrix for the graph. src: int or iterable: indices of source nodes dst: From e39a1e29e84d2bdfe56a88eaa6f309ccb0640c3d Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Wed, 7 Sep 2022 12:05:40 +0100 Subject: [PATCH 03/11] Add to notebook --- examples/graph_algebra/gla_ex0_0.ipynb | 93 ++++++++++++++++++++++---- kglab/algebra.py | 3 +- kglab/networks.py | 4 ++ 3 files changed, 87 insertions(+), 13 deletions(-) diff --git a/examples/graph_algebra/gla_ex0_0.ipynb b/examples/graph_algebra/gla_ex0_0.ipynb index a7fd661..77139f6 100644 --- a/examples/graph_algebra/gla_ex0_0.ipynb +++ b/examples/graph_algebra/gla_ex0_0.ipynb @@ -17,16 +17,16 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 1, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -34,6 +34,9 @@ "source": [ "# for use in tutorial and development; do not include this `sys.path` change in production:\n", "import sys ; sys.path.insert(0, \"../../\")\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "\n", "from os.path import dirname\n", "import kglab\n", "import os\n", @@ -62,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -82,12 +85,12 @@ "## define a subgraph\n", "In this case we are looking for the network of parent-child relations among members of Vikings family.\n", "\n", - "With this query we can define a **subgraph** so to have access to **graph algebra** capabilities: " + "With this query we can define a __*subgraph* so to have access to *graph algebra* capabilities__: " ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -106,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -119,7 +122,7 @@ " [0., 0., 0., 0., 0.]])" ] }, - "execution_count": 4, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -133,12 +136,51 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "what happened here is that all the subjects and objects have been turned into integer indices from 0 to number of nodes. So we can see that the entity with index 0 is adjancent (is connected, has a directed edge) to the entity with index 1. This is a directed graph because the relationship `gorm:childOf` goes from child to parent, let's turn this into an undirected graph so to see the relation in a more symmetric way (both the child-parent and parent-child)." + "what happened here is that all the subjects and objects have been turned into integer indices from 0 to number of nodes. So we can see that the entity with index 0 is adjancent (is connected, has a directed edge) to the entity with index 1. This is a directed graph because the relationship `gorm:childOf` goes from child to parent, let's turn this into an undirected graph so to see the relation in a more symmetric way (both the child-parent and parent-child).\n", + "\n", + "We can check the labels attached to the matrix's indices with:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "http://example.org/sagas#Astrid\n", + "http://example.org/sagas#Leif\n", + "http://example.org/sagas#Bodil\n", + "http://example.org/sagas#Bjorn\n", + "http://example.org/sagas#Gorm\n" + ] + } + ], + "source": [ + "for i in range(adj_matrix.shape[0]):\n", + " print(\n", + " subgraph.inverse_transform(i) # returns a label from an index\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see from the matrix, assigning labels to the indices, for examples that: Leif and Bodil are child of Astrid.\n", + "\n", + "This is one of the great functionality provided by the semantic layer (data that is represented by W3C Linked Data standard), to represent relationships in both human-understandable and machine-readable way.\n", + "\n", + "## other relevant matrices for a graph\n", + "\n", + "To compute the *vertices degrees matrix* we need to port our directed graph (semantic data graph are always directed as by design triples are `subject->relation->object`) into an undirected ones. This obviously preserve the existence of the relationships but not their direction." ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -151,7 +193,7 @@ " [0., 0., 0., 1., 0.]])" ] }, - "execution_count": 5, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -165,7 +207,34 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see now the relationship is a generic symmetric \"parenthood\" relations, not just a child-parent directed relationship." + "We can see now the relationship is a generic symmetric \"parenthood\" relations, not just a child-parent directed relationship. We can still say that: Leif and Bodil and Astrid are first-degree kins (parent-child or siblings). \n", + "\n", + "Same easy way we can compute the vertices degrees matrix:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 2, -1, -1, 0, 0],\n", + " [-1, 2, 0, -1, 0],\n", + " [-1, 0, 1, 0, 0],\n", + " [ 0, -1, 0, 2, -1],\n", + " [ 0, 0, 0, -1, 1]])" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "laplacian = subgraph.to_laplacian()\n", + "laplacian" ] } ], diff --git a/kglab/algebra.py b/kglab/algebra.py index 2ac39d8..5c0305e 100644 --- a/kglab/algebra.py +++ b/kglab/algebra.py @@ -46,7 +46,8 @@ def to_incidence(self): def to_laplacian(self): """ Return Laplacian matrix for the KG. Graph is turned into undirected. -[docs](https://networkx.org/documentation/stable/reference/generated/networkx.linalg.laplacianmatrix.laplacian_matrix.html) +[docs](https://networkx.org/documentation/stable/reference/generated/networkx.linalg.laplacianmatrix.laplacian_matrix.html). +Lapliacian is also known as vertices degrees matrix. returns: `numpy.array`: the array representation in `numpy` standard diff --git a/kglab/networks.py b/kglab/networks.py index 0f2eb3d..bcf7e7d 100644 --- a/kglab/networks.py +++ b/kglab/networks.py @@ -40,6 +40,10 @@ def get_shortest_path(self, src, dst): def describe(self): # number of nodes, number of edges + # + # center + # diameter + # eccentricity # density # triangles # reciprocity From 2bb939d3a48f7ba19e60497e27484ba29ed1d063 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Wed, 7 Sep 2022 12:25:05 +0100 Subject: [PATCH 04/11] Fix test --- tests/test_networks.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_networks.py b/tests/test_networks.py index 011970c..249bbf8 100644 --- a/tests/test_networks.py +++ b/tests/test_networks.py @@ -58,16 +58,16 @@ def test_shortest_path(kg_test_data): subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) try: - subgraph.get_shortest_path(subgraph.to_adjacency(), 2, 6) + subgraph.get_shortest_path(2, 6) assert False except NetworkXNoPath: pass - dist = subgraph.get_shortest_path(subgraph.to_adjacency(), 0, 2) + dist = subgraph.get_shortest_path(0, 2) assert dist == [0, 2] try: - dist = subgraph.get_shortest_path(subgraph.to_adjacency(), 0, 7) + dist = subgraph.get_shortest_path(0, 7) assert False except NetworkXNoPath: pass From b1029803d0c693cb85190c4c2c449a2b7c1bce2e Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Wed, 7 Sep 2022 15:01:04 +0100 Subject: [PATCH 05/11] Implement get_nodes and get_edges --- kglab/networks.py | 14 +++++++++++++- kglab/subg.py | 8 ++++++++ tests/test_algebra_basic.py | 6 ++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/kglab/networks.py b/kglab/networks.py index bcf7e7d..c3c1263 100644 --- a/kglab/networks.py +++ b/kglab/networks.py @@ -39,11 +39,23 @@ def get_shortest_path(self, src, dst): return nx.shortest_path(self.nx_graph, source=src, target=dst) def describe(self): - # number of nodes, number of edges + """ +Return a summary for subgraph statistics. + + return: +dict: a dictionary with stats + """ + + return { + "n_nodes": self._get_n_nodes(), + "n_edges": self._get_n_edges() # # center # diameter # eccentricity + } + + def describe_more(self): # density # triangles # reciprocity diff --git a/kglab/subg.py b/kglab/subg.py index d61a9dc..0443bb9 100644 --- a/kglab/subg.py +++ b/kglab/subg.py @@ -11,6 +11,7 @@ import pandas as pd # type: ignore import pyvis.network # type: ignore import networkx as nx # type: ignore +import numpy as np # type: ignore from kglab import KnowledgeGraph from kglab.topo import Measure @@ -336,6 +337,13 @@ def build_ig_graph ( ig_graph.vs["label"] = ig_graph.vs["name"] # pylint: disable=E1136,E1137 return ig_graph + + def _get_n_nodes(self): + return self.to_adjacency().shape[0] + + def _get_n_edges(self): + return int(np.sum(self.to_adjacency())) + class SubgraphTensor (Subgraph): diff --git a/tests/test_algebra_basic.py b/tests/test_algebra_basic.py index 0a8c42d..92f8235 100644 --- a/tests/test_algebra_basic.py +++ b/tests/test_algebra_basic.py @@ -92,3 +92,9 @@ def test_scipy_sparse(kg_test_data): not_set_ = ((8, 8), (10, 6), (10, 10), (8, 1), (249, 2)) assert all(i in get_items(n_array) for i in set_) assert all(i not in get_items(n_array) for i in not_set_) + +def test_get_numbers(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + + assert subgraph._get_n_nodes() == 256 + assert subgraph._get_n_edges() == 1078 From eca7c8883390ccddbdb4d82b73841eebc83c7332 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Wed, 7 Sep 2022 15:28:03 +0100 Subject: [PATCH 06/11] Implement SubgraphMatrix.describe() --- kglab/networks.py | 23 +++++++++++++++++------ tests/test_networks.py | 5 +++++ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/kglab/networks.py b/kglab/networks.py index c3c1263..e5695e1 100644 --- a/kglab/networks.py +++ b/kglab/networks.py @@ -7,6 +7,7 @@ """ import networkx as nx +from networkx.exception import NetworkXError from scipy.spatial.distance import pdist, squareform class NetAnalysisMixin: @@ -41,20 +42,30 @@ def get_shortest_path(self, src, dst): def describe(self): """ Return a summary for subgraph statistics. +NOTE: we may cache these methods calls if we create something like a `GraphFrame` object. + see kglab#273, same for adjacency and other matrices. return: dict: a dictionary with stats """ + def wrap(f, g, r): + try: + return f(g) + except NetworkXError as e: + r[f"{str(f.__name__)}_msg"] = str(e) + return None - return { + results = { "n_nodes": self._get_n_nodes(), - "n_edges": self._get_n_edges() - # - # center - # diameter - # eccentricity + "n_edges": self._get_n_edges(), } + return { **results, **{ + "center": wrap(nx.center, self.nx_graph, results), + "diameter": wrap(nx.diameter, self.nx_graph, results), + "eccentricity": wrap(nx.eccentricity, self.nx_graph, results) + }} + def describe_more(self): # density # triangles diff --git a/tests/test_networks.py b/tests/test_networks.py index 249bbf8..3bc8bc4 100644 --- a/tests/test_networks.py +++ b/tests/test_networks.py @@ -72,4 +72,9 @@ def test_shortest_path(kg_test_data): except NetworkXNoPath: pass +def test_describe(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + print(subgraph.describe()) + + \ No newline at end of file From 4c8e6390cf84015bcde26e144232096f6a3e2c1a Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Wed, 7 Sep 2022 15:30:20 +0100 Subject: [PATCH 07/11] improve dosctring and method name --- kglab/networks.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kglab/networks.py b/kglab/networks.py index e5695e1..dca337a 100644 --- a/kglab/networks.py +++ b/kglab/networks.py @@ -48,7 +48,8 @@ def describe(self): return: dict: a dictionary with stats """ - def wrap(f, g, r): + def msg_if_raise(f, g, r): + """Handle error messages by adding a message key in the results""" try: return f(g) except NetworkXError as e: @@ -61,9 +62,9 @@ def wrap(f, g, r): } return { **results, **{ - "center": wrap(nx.center, self.nx_graph, results), - "diameter": wrap(nx.diameter, self.nx_graph, results), - "eccentricity": wrap(nx.eccentricity, self.nx_graph, results) + "center": msg_if_raise(nx.center, self.nx_graph, results), + "diameter": msg_if_raise(nx.diameter, self.nx_graph, results), + "eccentricity": msg_if_raise(nx.eccentricity, self.nx_graph, results) }} def describe_more(self): From edc98ccd36fb2e0dab5875675647f8a1f72e8300 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Wed, 7 Sep 2022 15:31:49 +0100 Subject: [PATCH 08/11] Add describe to example notebook --- examples/graph_algebra/gla_ex0_0.ipynb | 51 ++++++++++++++++++++------ 1 file changed, 39 insertions(+), 12 deletions(-) diff --git a/examples/graph_algebra/gla_ex0_0.ipynb b/examples/graph_algebra/gla_ex0_0.ipynb index 77139f6..2649cba 100644 --- a/examples/graph_algebra/gla_ex0_0.ipynb +++ b/examples/graph_algebra/gla_ex0_0.ipynb @@ -17,16 +17,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 9, + "execution_count": 1, "metadata": {}, "output_type": "execute_result" } @@ -65,7 +65,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -90,7 +90,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -122,7 +122,7 @@ " [0., 0., 0., 0., 0.]])" ] }, - "execution_count": 12, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -143,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -180,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -193,7 +193,7 @@ " [0., 0., 0., 1., 0.]])" ] }, - "execution_count": 15, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -214,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -227,7 +227,7 @@ " [ 0, 0, 0, -1, 1]])" ] }, - "execution_count": 16, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -236,6 +236,33 @@ "laplacian = subgraph.to_laplacian()\n", "laplacian" ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_nodes': 5,\n", + " 'n_edges': 4,\n", + " 'center_msg': 'Found infinite path length because the digraph is not strongly connected',\n", + " 'diameter_msg': 'Found infinite path length because the digraph is not strongly connected',\n", + " 'eccentricity_msg': 'Found infinite path length because the digraph is not strongly connected',\n", + " 'center': None,\n", + " 'diameter': None,\n", + " 'eccentricity': None}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subgraph.describe()" + ] } ], "metadata": { From 4685e5598022a2b00efbec76a5f242f07ee602c0 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Thu, 8 Sep 2022 11:31:59 +0100 Subject: [PATCH 09/11] Add to notebook --- examples/graph_algebra/gla_ex0_0.ipynb | 342 ++++++++++++++++++++++--- 1 file changed, 310 insertions(+), 32 deletions(-) diff --git a/examples/graph_algebra/gla_ex0_0.ipynb b/examples/graph_algebra/gla_ex0_0.ipynb index 2649cba..b2a1d01 100644 --- a/examples/graph_algebra/gla_ex0_0.ipynb +++ b/examples/graph_algebra/gla_ex0_0.ipynb @@ -17,16 +17,16 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 1, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -65,16 +65,81 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 51, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
subjectobject
0gorm:Astridgorm:Leif
1gorm:Astridgorm:Bodil
2gorm:Leifgorm:Bjorn
3gorm:Bjorngorm:Gorm
\n", + "
" + ], + "text/plain": [ + " subject object\n", + "0 gorm:Astrid gorm:Leif\n", + "1 gorm:Astrid gorm:Bodil\n", + "2 gorm:Leif gorm:Bjorn\n", + "3 gorm:Bjorn gorm:Gorm" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "query = \"\"\"SELECT ?subject ?object\n", "WHERE {\n", " ?subject rdf:type gorm:Viking .\n", " ?subject gorm:childOf ?object .\n", "}\n", - "\"\"\"" + "\"\"\"\n", + "df = kg.query_as_df(query)\n", + "df" ] }, { @@ -90,7 +155,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ @@ -109,7 +174,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -122,7 +187,7 @@ " [0., 0., 0., 0., 0.]])" ] }, - "execution_count": 4, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -143,25 +208,27 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "http://example.org/sagas#Astrid\n", - "http://example.org/sagas#Leif\n", - "http://example.org/sagas#Bodil\n", - "http://example.org/sagas#Bjorn\n", - "http://example.org/sagas#Gorm\n" + "index -> label\n", + "0 -> http://example.org/sagas#Astrid\n", + "1 -> http://example.org/sagas#Leif\n", + "2 -> http://example.org/sagas#Bodil\n", + "3 -> http://example.org/sagas#Bjorn\n", + "4 -> http://example.org/sagas#Gorm\n" ] } ], "source": [ + "print(\"index\", \"->\", \"label\")\n", "for i in range(adj_matrix.shape[0]):\n", " print(\n", - " subgraph.inverse_transform(i) # returns a label from an index\n", + " i, \"->\", subgraph.inverse_transform(i) # returns a label from an index\n", " )" ] }, @@ -169,9 +236,44 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can see from the matrix, assigning labels to the indices, for examples that: Leif and Bodil are child of Astrid.\n", + "We can see from the matrix, assigning labels to the indices, for examples that: Astrid is child of Leif and Bodil.\n", "\n", "This is one of the great functionality provided by the semantic layer (data that is represented by W3C Linked Data standard), to represent relationships in both human-understandable and machine-readable way.\n", + "\n", + "Another useful method is `describe()`, that returns some statistics of the graph if they can be computed:" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'n_nodes': 5,\n", + " 'n_edges': 4,\n", + " 'center_msg': 'Found infinite path length because the digraph is not strongly connected',\n", + " 'diameter_msg': 'Found infinite path length because the digraph is not strongly connected',\n", + " 'eccentricity_msg': 'Found infinite path length because the digraph is not strongly connected',\n", + " 'center': None,\n", + " 'diameter': None,\n", + " 'eccentricity': None}" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subgraph.describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ "\n", "## other relevant matrices for a graph\n", "\n", @@ -180,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 56, "metadata": {}, "outputs": [ { @@ -193,7 +295,7 @@ " [0., 0., 0., 1., 0.]])" ] }, - "execution_count": 6, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -214,7 +316,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 57, "metadata": {}, "outputs": [ { @@ -227,7 +329,7 @@ " [ 0, 0, 0, -1, 1]])" ] }, - "execution_count": 7, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -237,31 +339,207 @@ "laplacian" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An incidence, or edge matrix `E`, uses by convention the rows to represent every node in the graph and the columns represent every edge. Some other convention does the opposite." + ] + }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'n_nodes': 5,\n", - " 'n_edges': 4,\n", - " 'center_msg': 'Found infinite path length because the digraph is not strongly connected',\n", - " 'diameter_msg': 'Found infinite path length because the digraph is not strongly connected',\n", - " 'eccentricity_msg': 'Found infinite path length because the digraph is not strongly connected',\n", - " 'center': None,\n", - " 'diameter': None,\n", - " 'eccentricity': None}" + "array([[1., 1., 0., 0.],\n", + " [1., 0., 1., 0.],\n", + " [0., 1., 0., 0.],\n", + " [0., 0., 1., 1.],\n", + " [0., 0., 0., 1.]])" ] }, - "execution_count": 8, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "subgraph.describe()" + "incidence = subgraph.to_incidence()\n", + "incidence" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## try another query\n", + "Let's try the same tools with another query that will define a different subgraph from the main graph: " + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "query2 = \"\"\"SELECT ?subject ?object\n", + "WHERE {\n", + " ?subject rdf:type gorm:Viking .\n", + " ?subject gorm:spouseOf ?object .\n", + "}\n", + "\"\"\"\n", + "df = kg.query_as_df(query2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This time we try a *symmetric* relation `gorm:spouseOf`. Let's try to understand better how the RDF definition of our semantic layer works:" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "@prefix foaf: .\n", + "@prefix gorm: .\n", + "@prefix owl: .\n", + "@prefix rdfs: .\n", + "@prefix skos: .\n", + "\n", + "gorm:Astrid a gorm:Viking ;\n", + " gorm:childOf gorm:Bodil,\n", + " gorm:Leif ;\n", + " foaf:topic_interest gorm:Fighting .\n", + "\n", + "gorm:childOf rdfs:domain gorm:Viking ;\n", + " rdfs:range gorm:Viking ;\n", + " owl:inverseOf gorm:ancestorOf .\n", + "\n", + "gorm:spouseOf a owl:SymmetricProperty ;\n", + " rdfs:domain gorm:Viking ;\n", + " rdfs:range gorm:Viking .\n", + "\n", + "gorm:Berserkr a foaf:Thing ;\n", + " skos:broader gorm:Fighting .\n", + "\n", + "gorm:Bjorn a gorm:Viking ;\n", + " gorm:childOf gorm:Gorm ;\n", + " foaf:topic_interest gorm:Pilaging .\n", + "\n", + "gorm:Bodil a gorm:Viking ;\n", + " gorm:spouseOf gorm:Leif .\n", + "\n", + "gorm:Gorm a gorm:Viking ;\n", + " foaf:topic_interest gorm:Berserkr .\n", + "\n", + "gorm:Pilaging a foaf:Thing ;\n", + " skos:broader gorm:Fighting .\n", + "\n", + "gorm:Leif a gorm:Viking ;\n", + " gorm:childOf gorm:Bjorn .\n", + "\n", + "gorm:Fighting a foaf:Thing .\n", + "\n", + "gorm:Viking a foaf:Person .\n", + "\n", + "\n" + ] + } + ], + "source": [ + "text = kg.save_rdf_text()\n", + "print(text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see `gorm:spouseOf` is defined as what the OWL standard calls a `owl:SymmetricProperty`, for this relation **domain** (the definition of the set of subject) and **range** (the definition of the set of object) are the same: so the triple looks like: `gorm:Viking`->`gorm:spouseOf`->`gorm:Viking`. Let's what data this relation returns." + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 1.],\n", + " [0., 0.]])" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "subgraph2 = SubgraphMatrix(kg=kg, sparql=query2)\n", + "A = subgraph2.to_adjacency()\n", + "A" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 -> http://example.org/sagas#Bodil\n", + "1 -> http://example.org/sagas#Leif\n" + ] + } + ], + "source": [ + "# Labels\n", + "for i in range(A.shape[0]):\n", + " print(\n", + " i, \"->\", subgraph2.inverse_transform(i) # returns a label from an index\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Turning this matrix representation into undirected, we can read more generic \"is married to\" relation:" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[0., 1.],\n", + " [1., 0.]])" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "undirected = subgraph2.to_undirected()\n", + "undirected" ] } ], From 0d3faa15f5c0f2b2cefa96c2c9df00627c822734 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Thu, 8 Sep 2022 11:59:11 +0100 Subject: [PATCH 10/11] Add test for subg module --- kglab/subg.py | 6 ++-- tests/test_algebra_basic.py | 3 ++ tests/test_subg.py | 59 +++++++++++++++++++++++++++++++++++++ 3 files changed, 65 insertions(+), 3 deletions(-) create mode 100644 tests/test_subg.py diff --git a/kglab/subg.py b/kglab/subg.py index 0443bb9..c70cb82 100644 --- a/kglab/subg.py +++ b/kglab/subg.py @@ -233,7 +233,7 @@ def build_df ( if self.sparql is None and self.kg.use_gpus is True: raise ValueError("""To use GPUs is necessary to provide a SPARQL query to define a subgraph: - `kglab.SubgraphMatrix(kg, sparql)` or `SubgraphTensor`""") + `kglab.SubgraphMatrix(kg, sparql)` or `SubgraphTensor(...)`""") row_iter = self.kg.query(self.sparql, bindings=self.bindings) if not show_symbols: @@ -257,7 +257,7 @@ def build_df ( for row in row_iter ] - if self.kg.use_gpus: + if self.kg.use_gpus is True: df = cudf.DataFrame(rows_list, columns=col_names) else: df = pd.DataFrame(rows_list, columns=col_names) @@ -284,7 +284,7 @@ def build_nx_graph ( returns: the populated `NetworkX` graph object; uses the [RAPIDS `cuGraph` library](https://docs.rapids.ai/api/cugraph/stable/) if GPUs are enabled """ - if self.kg.use_gpus: + if self.kg.use_gpus is True: df = self.build_df() nx_graph.from_cudf_edgelist(df, source="src", destination="dst") else: diff --git a/tests/test_algebra_basic.py b/tests/test_algebra_basic.py index 92f8235..b675b04 100644 --- a/tests/test_algebra_basic.py +++ b/tests/test_algebra_basic.py @@ -95,6 +95,9 @@ def test_scipy_sparse(kg_test_data): def test_get_numbers(kg_test_data): subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + subgraph.check_attributes() assert subgraph._get_n_nodes() == 256 assert subgraph._get_n_edges() == 1078 + assert subgraph.nx_graph.number_of_nodes() == 256 + assert subgraph.nx_graph.number_of_edges() == 1078 diff --git a/tests/test_subg.py b/tests/test_subg.py new file mode 100644 index 0000000..0522432 --- /dev/null +++ b/tests/test_subg.py @@ -0,0 +1,59 @@ +import pytest +import networkx as nx + +import kglab +from kglab.subg import SubgraphMatrix + +from .__init__ import DAT_FILES_DIR + + +@pytest.fixture() +def kg_test_data(): + namespaces = { + "nom": "http://example.org/#", + "wtm": "http://purl.org/heals/food/", + "ind": "http://purl.org/heals/ingredient/", + "skos": "http://www.w3.org/2004/02/skos/core#", + } + + kg = kglab.KnowledgeGraph( + name = "A recipe KG example based on Food.com", + base_uri = "https://www.food.com/recipe/", + namespaces = namespaces, + ) + + kg.load_rdf(DAT_FILES_DIR / "tmp.ttl") + + yield kg + + del kg + +# +# A query that defines a subgraph as subject -> object +# +QUERY1 = """ +SELECT ?subject ?object +WHERE { + ?subject rdf:type wtm:Recipe . + ?subject wtm:hasIngredient ?object . +} +""" + +def test_build_df_no_cuda(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + subgraph.kg.use_gpus = False + + df = subgraph.build_df() + results = list(df.itertuples(index=False, name=None)) + + expected = ((254, 3), (254,4), (255, 1), (255, 2), (255, 6)) + assert all(r == e for r in results[-1:-5] for e in expected) + +def test_build_nx_graph(kg_test_data): + subgraph = SubgraphMatrix(kg=kg_test_data, sparql=QUERY1) + subgraph.kg.use_gpus = False + + nxg = nx.DiGraph() + subgraph.build_nx_graph(nxg) + + assert nxg.number_of_nodes() == 256 and nxg.number_of_edges() == 1078 From 4464ebb7cf7eef2ae33374d81911d372d02c8656 Mon Sep 17 00:00:00 2001 From: "Lorenzo (Mec-iS)" Date: Thu, 8 Sep 2022 14:19:48 +0100 Subject: [PATCH 11/11] Implement typing for mixin --- kglab/__init__.py | 7 +++--- kglab/algebra.py | 12 ++++++++-- kglab/kglab.py | 38 ++++++++++++++--------------- kglab/networks.py | 9 +++++-- kglab/query/mixin.py | 11 ++++----- kglab/serde.py | 38 ++++++++++++++--------------- kglab/standards.py | 57 +++++++++++++++++++------------------------- kglab/subg.py | 32 ++++++++++++------------- kglab/util.py | 19 +++++++++++++++ 9 files changed, 122 insertions(+), 101 deletions(-) diff --git a/kglab/__init__.py b/kglab/__init__.py index afad3fd..32e24b8 100644 --- a/kglab/__init__.py +++ b/kglab/__init__.py @@ -1,7 +1,6 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# see license https://github.com/DerwenAI/kglab#license-and-copyright - +""" +see license https://github.com/DerwenAI/kglab#license-and-copyright +""" from .kglab import KnowledgeGraph from .graph import NodeRef, PropertyStore diff --git a/kglab/algebra.py b/kglab/algebra.py index 5c0305e..39e738c 100644 --- a/kglab/algebra.py +++ b/kglab/algebra.py @@ -10,7 +10,9 @@ import networkx as nx from networkx import DiGraph -class AlgebraMixin: +from kglab.util import Mixin + +class AlgebraMixin(Mixin): """ Provides methods to work with graph algebra using `SubgraphMatrix` data. @@ -19,6 +21,12 @@ class AlgebraMixin: nx_graph: typing.Optional[DiGraph] = None def to_undirected(self): + """ +Return the undirected adjancency matrix of the directed graph. + + returns: +`numpy.array`: the array representation in `numpy` standard + """ return nx.to_numpy_array(self.nx_graph.to_undirected()) def to_adjacency(self): @@ -60,7 +68,7 @@ def to_scipy_sparse(self): Return graph in CSR format (optimized for matrix-matrix operations). returns: -SciPy sparse matrix: Graph adjacency matrix. +SciPy sparse matrix: Graph adjacency matrix. """ self.check_attributes() return nx.to_scipy_sparse_array(self.nx_graph) diff --git a/kglab/kglab.py b/kglab/kglab.py index cee7467..2156179 100644 --- a/kglab/kglab.py +++ b/kglab/kglab.py @@ -40,7 +40,7 @@ class KnowledgeGraph(QueryingMixin, SerdeMixin, ShaclOwlRdfSkosMixin): Core feature areas include: - * namespace management: ontology, controlled vocabularies + * namespace management: ontology, controlled vocabularies * graph construction * serialization-deserilization (see `serde` module) * SPARQL querying (see `query.mixin` module) @@ -65,12 +65,12 @@ def __init__ ( self, *, name: str = "generic", - base_uri: str = None, + base_uri: typing.Optional[str] = None, language: str = "en", - store: str = None, + store: typing.Optional[str] = None, use_gpus: bool = True, import_graph: typing.Optional[GraphLike] = None, - namespaces: dict = None, + namespaces: typing.Optional[dict] = None, ) -> None: """ Constructor for a `KnowledgeGraph` object. @@ -112,17 +112,17 @@ def __init__ ( if import_graph is not None: self._g = import_graph else: - self._g = self.build_blank_graph() + self._g = self.build_blank_graph() # pylint: disable=E1101 # initialize the namespaces self._ns: dict = {} for prefix, iri in self._DEFAULT_NAMESPACES.items(): - self.add_ns(prefix, iri) + self.add_ns(prefix, iri) # pylint: disable=E1101 if namespaces: for prefix, iri in namespaces.items(): - self.add_ns(prefix, iri) + self.add_ns(prefix, iri) # pylint: disable=E1101 # backwards compatibility for class refactoring self.sparql = kglab.query.sparql.SparqlQueryable(self) @@ -203,7 +203,7 @@ def add_ns ( if replace or prefix not in self._ns: self._ns[prefix] = rdflib.Namespace(iri) - self._g.namespace_manager.bind( + self._g.namespace_manager.bind( # type: ignore prefix, self._ns[prefix], override=override, @@ -241,7 +241,7 @@ def get_ns_dict ( for prefix, ns in self._ns.items() } - nm = self._g.namespace_manager + nm = self._g.namespace_manager # type: ignore for prefix, uri in nm.namespaces(): ns_dict[prefix] = str(uri) @@ -314,7 +314,7 @@ def encode_date ( [`rdflib.Literal`](https://rdflib.readthedocs.io/en/stable/rdf_terms.html#literals) formatted as an XML Schema 2 `dateTime` value """ date_tz = dup.parse(dt, tzinfos=tzinfos) - return rdflib.Literal(date_tz, datatype=self.get_ns("xsd").dateTime) + return rdflib.Literal(date_tz, datatype=self.get_ns("xsd").dateTime) # pylint: disable=E1101 def add ( @@ -342,7 +342,7 @@ def add ( must be a [`rdflib.term.Node`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=Node#rdflib.term.Node) or [`rdflib.term.Terminal`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=Node#rdflib.term.Literal); otherwise throws a `TypeError` exception """ try: - self._g.add((s, p, o,)) + self._g.add((s, p, o,)) # type: ignore except AssertionError as e: traceback.print_exc() ic(s) @@ -376,7 +376,7 @@ def remove ( must be a [`rdflib.term.Node`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=Node#rdflib.term.Node) or [`rdflib.term.Terminal`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html?highlight=Node#rdflib.term.Literal); otherwise throws a `TypeError` exception """ try: - self._g.remove((s, p, o,)) + self._g.remove((s, p, o,)) # type: ignore except AssertionError as e: traceback.print_exc() ic(s) @@ -386,7 +386,7 @@ def remove ( def graph_factory(self, name, graph): - """ + """ Utility function to generate graphs from mixins name: @@ -394,9 +394,9 @@ def graph_factory(self, name, graph): graph: initial graph - """ - return KnowledgeGraph( - name=name, - namespaces=self.get_ns_dict(), - import_graph=graph, - ) \ No newline at end of file + """ + return KnowledgeGraph( + name=name, + namespaces=self.get_ns_dict(), + import_graph=graph, + ) diff --git a/kglab/networks.py b/kglab/networks.py index dca337a..47cf6d1 100644 --- a/kglab/networks.py +++ b/kglab/networks.py @@ -10,7 +10,9 @@ from networkx.exception import NetworkXError from scipy.spatial.distance import pdist, squareform -class NetAnalysisMixin: +from kglab.util import Mixin + +class NetAnalysisMixin(Mixin): """ Provides methods for network analysis tools to work with `KnowledgeGraph`. """ @@ -68,7 +70,10 @@ def msg_if_raise(f, g, r): }} def describe_more(self): + """ +Return a summary with more graph statistics. + """ # density # triangles # reciprocity - raise NotImplementedError() \ No newline at end of file + raise NotImplementedError() diff --git a/kglab/query/mixin.py b/kglab/query/mixin.py index e0f92c6..aa2b57a 100644 --- a/kglab/query/mixin.py +++ b/kglab/query/mixin.py @@ -9,19 +9,18 @@ import typing ### third-parties libraries -from icecream import ic # type: ignore import pandas as pd # type: ignore import pyvis # type: ignore import rdflib # type: ignore import rdflib.plugin # type: ignore -import rdflib.plugins.parsers.notation3 as rdf_n3 # type: ignore ## kglab - core classes from kglab.pkg_types import RDF_Node from kglab.gpviz import GPViz from kglab.util import get_gpu_count from kglab.version import _check_version +from kglab.util import Mixin ## pre-constructor set-up @@ -31,7 +30,7 @@ import cudf # type: ignore # pylint: disable=E0401 -class QueryingMixin: +class QueryingMixin(Mixin): """ This class implements querying for `KnowledgeGraph` @@ -65,7 +64,7 @@ def query ( if not bindings: bindings = {} - for row in self._g.query( + for row in self._g.query( # type: ignore sparql, initBindings=bindings, ): @@ -102,7 +101,7 @@ def query_as_df ( if not bindings: bindings = {} - row_iter = self._g.query(sparql, initBindings=bindings) + row_iter = self._g.query(sparql, initBindings=bindings) # type: ignore if simplify: rows_list = [ self.n3fy_row(r.asdict(), pythonify=pythonify) for r in row_iter ] @@ -136,7 +135,7 @@ def visualize_query ( returns: PyVis network object, to be rendered """ - return GPViz(sparql, self._ns).visualize_query(notebook=notebook) + return GPViz(sparql, self._ns).visualize_query(notebook=notebook) # type: ignore def n3fy ( diff --git a/kglab/serde.py b/kglab/serde.py index da9d872..fb16f3d 100644 --- a/kglab/serde.py +++ b/kglab/serde.py @@ -28,6 +28,7 @@ from kglab.pkg_types import IOPathLike, PathLike from kglab.util import get_gpu_count from kglab.version import _check_version +from kglab.util import Mixin ## pre-constructor set-up @@ -37,7 +38,7 @@ import cudf # type: ignore # pylint: disable=E0401 -class SerdeMixin: +class SerdeMixin(Mixin): """ Provide serialization and deserialization methods for `KnowledgeGraph`: * RDF @@ -47,7 +48,6 @@ class SerdeMixin: * Morph-KGC * ROAM """ - ###################################################################### ## serialization ## @@ -156,7 +156,7 @@ def load_rdf ( format: str = "ttl", base: str = None, **args: typing.Any, - ) -> "KnowledgeGraph": + ) -> "KnowledgeGraph": # type: ignore """ Wrapper for [`rdflib.Graph.parse()`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.graph.Graph.parse) which parses an RDF graph from the `path` source. This traps some edge cases for the several source-ish parameters in RDFlib which had been overloaded. @@ -188,14 +188,14 @@ def load_rdf ( try: if hasattr(path, "read"): - self._g.parse( + self._g.parse( # type: ignore path, format=format, publicID=base, **args, ) else: - self._g.parse( + self._g.parse( # type: ignore self._get_filename(path), format=format, publicID=base, @@ -215,7 +215,7 @@ def load_rdf_text ( format: str = "ttl", base: str = None, **args: typing.Any, - ) -> "KnowledgeGraph": + ) -> "KnowledgeGraph": # type: ignore """ Wrapper for [`rdflib.Graph.parse()`](https://rdflib.readthedocs.io/en/stable/apidocs/rdflib.html#rdflib.graph.Graph.parse) which parses an RDF graph from a text. This traps some edge cases for the several source-ish parameters in RDFlib which had been overloaded. @@ -241,7 +241,7 @@ def load_rdf_text ( if not base and self.base_uri: base = self.base_uri - self._g.parse( + self._g.parse( # type: ignore data=data, format=format, publicID=base, @@ -295,7 +295,7 @@ def save_rdf ( raise TypeError(self._ERROR_PATH) try: - self._g.serialize( + self._g.serialize( # type: ignore destination=path, format=format, base=base, @@ -307,7 +307,7 @@ def save_rdf ( # otherwise write to a local file reference else: - self._g.serialize( + self._g.serialize( # type: ignore destination=self._get_filename(path), format=format, base=base, @@ -365,7 +365,7 @@ def load_jsonld ( *, encoding: str = "utf-8", **args: typing.Any, - ) -> "KnowledgeGraph": + ) -> "KnowledgeGraph": # type: ignore """ Wrapper for [`rdflib-jsonld.parser.JsonLDParser.parse()`](https://github.com/RDFLib/rdflib-jsonld/blob/master/rdflib_jsonld/parser.py) which parses an RDF graph from a [JSON-LD](https://json-ld.org/) source. This traps some edge cases for the several source-ish parameters in RDFlib which had been overloaded. @@ -393,7 +393,7 @@ def load_jsonld ( # load JSON from file (to verify format and trap exceptions at # this level) then dump to string – which is expected by the # JSON-LD plugin for RDFlib - self._g.parse( + self._g.parse( # type: ignore data=json.dumps(json.load(f)), # type: ignore format="json-ld", encoding=encoding, @@ -433,7 +433,7 @@ def save_jsonld ( self._check_encoding(encoding) f.write( # type: ignore - self._g.serialize( + self._g.serialize( # type: ignore format="json-ld", context=self.get_context(), indent=2, @@ -454,7 +454,7 @@ def load_parquet ( self, path: IOPathLike, **kwargs: typing.Any, - ) -> "KnowledgeGraph": + ) -> "KnowledgeGraph": # type: ignore """ Wrapper for [`pandas.read_parquet()`](https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html?highlight=read_parquet#pandas.read_parquet) which parses an RDF graph represented as a [Parquet](https://parquet.apache.org/) file, using the [`pyarrow`](https://arrow.apache.org/) engine. Uses the [RAPIDS `cuDF` library](https://docs.rapids.ai/api/cudf/stable/) if GPUs are enabled. @@ -482,7 +482,7 @@ def load_parquet ( ) df.apply( - lambda row: self._g.parse( + lambda row: self._g.parse( # type: ignore data=f"{ row[0] } { row[1] } { row[2] } .", format="ttl", ), @@ -521,7 +521,7 @@ def save_parquet ( self._PARQUET_COL_NAMES[1]: p.n3(), self._PARQUET_COL_NAMES[2]: o.n3(), } - for s, p, o in self._g + for s, p, o in self._g # type: ignore ] if self.use_gpus: @@ -540,7 +540,7 @@ def save_parquet ( def load_csv ( self, url: str, - ) -> "KnowledgeGraph": + ) -> "KnowledgeGraph": # type: ignore """ Wrapper for [`csvwlib`](https://github.com/DerwenAI/csvwlib) which parses a CSV file from the `path` source, then converts to RDF and merges into this RDF graph. @@ -561,7 +561,7 @@ def load_csv ( def materialize ( self, config: str, - ) -> "KnowledgeGraph": + ) -> "KnowledgeGraph": # type: ignore """ Binding to the [Morph-KGC](https://github.com/oeg-upm/morph-kgc) `materialize()` method. @@ -571,14 +571,14 @@ def materialize ( returns: this `KnowledgeGraph` object – used for method chaining """ - if len(self._g) == 0: + if len(self._g) == 0: # type: ignore # generate the triples and load them to an RDFlib graph self._g = morph_kgc.materialize(config) else: # merge # for caveats about merging this way: # - self._g.parse(morph_kgc.materialize(config)) + self._g.parse(morph_kgc.materialize(config)) # type: ignore return self diff --git a/kglab/standards.py b/kglab/standards.py index 973d04f..0700ca9 100644 --- a/kglab/standards.py +++ b/kglab/standards.py @@ -8,30 +8,21 @@ import typing ### third-parties libraries -from icecream import ic # type: ignore import chocolate # type: ignore import owlrl # type: ignore import pyshacl # type: ignore -import rdflib.plugin # type: ignore - ## kglab - core classes from kglab.pkg_types import GraphLike -from kglab.util import get_gpu_count -from kglab.version import _check_version - - -## pre-constructor set-up -_check_version() +from kglab.util import Mixin -if get_gpu_count() > 0: - import cudf # type: ignore - -class ShaclOwlRdfSkosMixin: +class ShaclOwlRdfSkosMixin(Mixin): """ Provide methods for SHACL- OWL- and RDF-related operations. """ + _g: typing.Optional[GraphLike] + ###################################################################### ## SHACL validation @@ -47,7 +38,7 @@ def validate ( inplace:typing.Optional[bool] = True, abort_on_first: typing.Optional[bool] = None, **kwargs: typing.Any, - ) -> typing.Tuple[bool, "KnowledgeGraph", str]: + ) -> typing.Tuple[bool, "KnowledgeGraph", str]: # type: ignore """ Wrapper for [`pyshacl.validate()`](https://github.com/RDFLib/pySHACL) for validating the RDF graph using rules expressed in the [SHACL](https://www.w3.org/TR/shacl/) (Shapes Constraint Language). @@ -152,16 +143,16 @@ def infer_rdfs_properties ( # key: property val: set([superprop1, superprop2..]) super_props: typing.Dict[typing.Any, typing.Any] = {} - for s, o in self._g.subject_objects(_rdfs.subPropertyOf): + for s, o in self._g.subject_objects(_rdfs.subPropertyOf): # type: ignore super_props.setdefault(s, set()) - for sub_prop in self._g.transitive_objects(s, _rdfs.subPropertyOf): + for sub_prop in self._g.transitive_objects(s, _rdfs.subPropertyOf): # type: ignore if sub_prop != s: super_props[s].add(sub_prop) # add super-property relationships for p, sup_prop_list in super_props.items(): - for s, o in self._g.subject_objects(p): + for s, o in self._g.subject_objects(p): # type: ignore for sup_prop in sup_prop_list: self.add(s, sup_prop, o) @@ -180,16 +171,16 @@ def infer_rdfs_classes ( # key: class val: set([superclass1, superclass2..]) super_classes: typing.Dict[typing.Any, typing.Any] = {} - for s, _ in self._g.subject_objects(_rdfs.subClassOf): + for s, _ in self._g.subject_objects(_rdfs.subClassOf): # type: ignore super_classes.setdefault(s, set()) - for sup_class in self._g.transitive_objects(s, _rdfs.subClassOf): + for sup_class in self._g.transitive_objects(s, _rdfs.subClassOf): # type: ignore if sup_class != s: super_classes[s].add(sup_class) # set the superclass type information for subclass instances for s, sup_class_list in super_classes.items(): - for sub_inst in self._g.subjects(self.get_ns("rdf").type, s): + for sub_inst in self._g.subjects(self.get_ns("rdf").type, s): # type: ignore for sup_class in sup_class_list: self.add(sub_inst, self.get_ns("rdf").type, sup_class) @@ -210,7 +201,7 @@ def infer_skos_related ( """ _skos = self.get_ns("skos") - for s, o in self._g.subject_objects(_skos.related): + for s, o in self._g.subject_objects(_skos.related): # type: ignore self.add(o, _skos.related, s) @@ -228,13 +219,13 @@ def infer_skos_concept ( """ _skos = self.get_ns("skos") - for s, o in self._g.subject_objects(_skos.hasTopConcept): + for s, o in self._g.subject_objects(_skos.hasTopConcept): # type: ignore self.add(o, _skos.topConceptOf, s) - for s, o in self._g.subject_objects(_skos.topConceptOf): + for s, o in self._g.subject_objects(_skos.topConceptOf): # type: ignore self.add(o, _skos.hasTopConcept, s) - for s, o in self._g.subject_objects(_skos.topConceptOf): + for s, o in self._g.subject_objects(_skos.topConceptOf): # type: ignore self.add(s, _skos.inScheme, o) @@ -255,10 +246,10 @@ def infer_skos_hierarchical ( _skos = self.get_ns("skos") if narrower: - for s, o in self._g.subject_objects(_skos.broader): + for s, o in self._g.subject_objects(_skos.broader): # type: ignore self.add(o, _skos.narrower, s) - for s, o in self._g.subject_objects(_skos.narrower): + for s, o in self._g.subject_objects(_skos.narrower): # type: ignore self.add(o, _skos.broader, s) if not narrower: @@ -285,8 +276,8 @@ def infer_skos_transitive ( """ _skos = self.get_ns("skos") - for concept in self._g.subjects(self.get_ns("rdf").type, _skos.Concept): - for broader_concept in self._g.transitive_objects(concept, _skos.broader): + for concept in self._g.subjects(self.get_ns("rdf").type, _skos.Concept): # type: ignore + for broader_concept in self._g.transitive_objects(concept, _skos.broader): # type: ignore if broader_concept != concept: self.add(concept, _skos.broaderTransitive, broader_concept) @@ -311,17 +302,17 @@ def infer_skos_symmetric_mappings ( """ _skos = self.get_ns("skos") - for s, o in self._g.subject_objects(_skos.relatedMatch): + for s, o in self._g.subject_objects(_skos.relatedMatch): # type: ignore self.add(o, _skos.relatedMatch, s) if related: self.add(s, _skos.related, o) self.add(o, _skos.related, s) - for s, o in self._g.subject_objects(_skos.closeMatch): + for s, o in self._g.subject_objects(_skos.closeMatch): # type: ignore self.add(o, _skos.closeMatch, s) - for s, o in self._g.subject_objects(_skos.exactMatch): + for s, o in self._g.subject_objects(_skos.exactMatch): # type: ignore self.add(o, _skos.exactMatch, s) @@ -344,14 +335,14 @@ def infer_skos_hierarchical_mappings ( """ _skos = self.get_ns("skos") - for s, o in self._g.subject_objects(_skos.broadMatch): + for s, o in self._g.subject_objects(_skos.broadMatch): # type: ignore self.add(s, _skos.broader, o) if narrower: self.add(o, _skos.narrowMatch, s) self.add(o, _skos.narrower, s) - for s, o in self._g.subject_objects(_skos.narrowMatch): + for s, o in self._g.subject_objects(_skos.narrowMatch): # type: ignore self.add(o, _skos.broadMatch, s) self.add(o, _skos.broader, s) diff --git a/kglab/subg.py b/kglab/subg.py index c70cb82..a137030 100644 --- a/kglab/subg.py +++ b/kglab/subg.py @@ -28,11 +28,11 @@ class Subgraph: """ -Base class for projection of an RDF graph into an *algebraic object* such as a *vector*, -*matrix*, or *tensor* representation, to support integration with non-RDF graph libraries. -In other words, this class provides means to vectorize selected portions of a graph as a -[*dimension*](https://mathworld.wolfram.com/Dimension.html). -See +Base class for projection of an RDF graph into an *algebraic object* such as a *vector*, + *matrix*, or *tensor* representation, to support integration with non-RDF graph libraries. + In other words, this class provides means to vectorize selected portions of a graph as a + [*dimension*](https://mathworld.wolfram.com/Dimension.html). + See Features support several areas of use cases, including: @@ -47,7 +47,7 @@ class Subgraph: a *vector*, in the `node_vector` member. This provides an efficient *index* on a constructed *dimension*, solely for the context of a specific use case. """ - kg: typing.Optional[KnowledgeGraph] = None + kg: KnowledgeGraph nx_graph: typing.Optional[nx.DiGraph] = None def __init__ ( @@ -159,13 +159,13 @@ def check_attributes(self): `kglab.Subgraph(kg)`""" ) - # create an empy `nx.DiGraph` if none is present + # create an empty `nx.DiGraph` if none is present if self.nx_graph is None: # NOTE: find a way to pass `bipartite` if needed - self.nx_graph = self.build_nx_graph(nx.DiGraph()) + self.nx_graph = self.build_nx_graph(nx.DiGraph()) # pylint: disable=E1101 -class SubgraphMatrix (Subgraph, AlgebraMixin, NetAnalysisMixin): +class SubgraphMatrix (Subgraph, AlgebraMixin, NetAnalysisMixin): # pylint: disable=W0223 """ Projection of a RDF graph to a [*matrix*](https://mathworld.wolfram.com/AdjacencyMatrix.html) representation. Typical use cases include integration with non-RDF graph libraries for *graph algorithms*. @@ -173,7 +173,6 @@ class SubgraphMatrix (Subgraph, AlgebraMixin, NetAnalysisMixin): SPARQL query text needs to define a subgraph as: `subject -> object`. """ _SRC_DST_MAP: typing.List[str] = ["subject", "object"] - sparql: typing.Optional[str] = None def __init__ ( self, @@ -232,7 +231,7 @@ def build_df ( col_names: typing.List[str] = [ "src", "dst", "src_sym", "dst_sym" ] if self.sparql is None and self.kg.use_gpus is True: - raise ValueError("""To use GPUs is necessary to provide a SPARQL query to define a subgraph: + raise ValueError("""To use GPUs is necessary to provide a SPARQL query to define a subgraph: `kglab.SubgraphMatrix(kg, sparql)` or `SubgraphTensor(...)`""") row_iter = self.kg.query(self.sparql, bindings=self.bindings) @@ -337,13 +336,14 @@ def build_ig_graph ( ig_graph.vs["label"] = ig_graph.vs["name"] # pylint: disable=E1136,E1137 return ig_graph - + def _get_n_nodes(self): + """ Return number of nodes counted from the adjancency matrix""" return self.to_adjacency().shape[0] - + def _get_n_edges(self): + """ Return number of edges counted from the adjancency matrix""" return int(np.sum(self.to_adjacency())) - class SubgraphTensor (Subgraph): @@ -454,14 +454,14 @@ def as_tensor ( ## to extend or create an analyst's account-specific network ## model. - def pyvis_style_node ( + def pyvis_style_node ( # pylint: disable=R0201 self, pyvis_graph: pyvis.network.Network, node_id: int, label: str, *, style: dict = None, - ) -> None : + ) -> None : # pylint: disable=R0201 """ Adds a node into a [PyVis](https://pyvis.readthedocs.io/) network, optionally with styling info. diff --git a/kglab/util.py b/kglab/util.py index ebe9fad..49af283 100644 --- a/kglab/util.py +++ b/kglab/util.py @@ -11,6 +11,9 @@ import numpy as np # type: ignore # pylint: disable=E0401 import pandas as pd # type: ignore # pylint: disable=E0401 +import typing +from kglab.pkg_types import GraphLike + GPU_COUNT: int = 0 @@ -116,3 +119,19 @@ def root_mean_square ( s = sum(map(lambda x: float(x)**2.0, values)) n = float(len(values)) return math.sqrt(s / n) + +class Mixin: + """Base mixin, Provide `mypy` stubs for common methods and properties""" + _g: typing.Optional[GraphLike] + get_ns: typing.Callable + add_ns: typing.Callable + _ns: typing.Dict + add: typing.Callable + base_uri: typing.Optional[str] + parse: typing.Callable + get_context: typing.Callable + use_gpus: bool + serialize: typing.Callable + build_blank_graph: typing.Callable + graph_factory: typing.Callable + remove: typing.Callable