diff --git a/config/defaults.py b/config/defaults.py
index 597f09f..2fdfe07 100644
--- a/config/defaults.py
+++ b/config/defaults.py
@@ -78,6 +78,14 @@
MUTPB_FV_SAMPLE_MAXN = 32 # max n of instantiations to sample from top k
MUTPB_FV_QUERY_LIMIT = 256 # SPARQL query limit for the top k instantiations
MUTPB_SP = 0.05 # prob to simplify pattern (warning: can restrict exploration)
+MUTPB_DN = 0.05 # prob to try a deep and narrow paths mutation
+MUTPB_DN_MIN_LEN = 2 # minimum length of the deep and narrow paths
+MUTPB_DN_MAX_LEN = 10 # max path length if not stopped by term_pb
+MUTPB_DN_TERM_PB = 0.7 # prob to terminate expansion each step >= min_len
+MUTPB_DN_MAX_NODE_COUNT = 10 # edge fixations may have <= this many nodes
+MUTPB_DN_MIN_EDGE_COUNT = 2 # edges need to be valid for >= this many GTPs
+MUTPB_DN_QUERY_LIMIT = 32 # SPARQL query limit for top edge fixations
+MUTPB_DN_REC_RETRIES = 3 # retrial attempts in each recursion, WARNING: EXP!
# for import in helpers and __init__
__all__ = [_v for _v in globals().keys() if _v.isupper()]
diff --git a/gp_learner.py b/gp_learner.py
index 2f453b0..d463cd0 100644
--- a/gp_learner.py
+++ b/gp_learner.py
@@ -48,6 +48,7 @@
from gp_query import predict_query
from gp_query import query_time_hard_exceeded
from gp_query import query_time_soft_exceeded
+from gp_query import dnp_query
from gp_query import variable_substitution_query
from graph_pattern import canonicalize
from graph_pattern import gen_random_var
@@ -653,6 +654,137 @@ def mutate_fix_var(
return res
+def _mutate_deep_narrow_path_helper(
+ sparql, timeout, gtp_scores, child, edge_var, node_var,
+ gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N,
+ max_node_count=config.MUTPB_DN_MAX_NODE_COUNT,
+ min_edge_count=config.MUTPB_DN_MIN_EDGE_COUNT,
+ limit=config.MUTPB_DN_QUERY_LIMIT,
+ sample_n=config.MUTPB_FV_SAMPLE_MAXN,
+):
+ assert isinstance(child, GraphPattern)
+ assert isinstance(gtp_scores, GTPScores)
+
+ # The further we get, the less gtps are remaining. Sampling too many (all)
+ # of them might hurt as common substitutions (> limit ones) which are dead
+ # ends could cover less common ones that could actually help
+ gtp_sample_n = min(gtp_sample_n, int(gtp_scores.remaining_gain))
+ gtp_sample_n = random.randint(1, gtp_sample_n)
+
+ ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps(
+ max_n=gtp_sample_n)
+ t, substitution_counts = dnp_query(
+ sparql, timeout, child, ground_truth_pairs,
+ edge_var=edge_var,
+ node_var=node_var,
+ max_node_count=max_node_count,
+ min_edge_count=min_edge_count,
+ limit=limit,
+ )
+ edge_count, node_sum_count = substitution_counts
+ if not node_sum_count:
+ # the current pattern is unfit, as we can't find anything fulfilling it
+ logger.debug("tried to fix a var %s without result:\n%s"
+ "seems as if the pattern can't be fulfilled!",
+ edge_var, child.to_sparql_select_query())
+ fixed = False
+ return child, fixed
+ mutate_fix_var_filter(node_sum_count)
+ mutate_fix_var_filter(edge_count)
+ if not node_sum_count:
+ # could have happened that we removed the only possible substitution
+ fixed = False
+ return child, fixed
+
+ prio = Counter()
+ for edge, node_sum in node_sum_count.items():
+ ec = edge_count[edge]
+ prio[edge] = ec / (node_sum / ec) # ec / AVG degree
+ # randomly pick n of the substitutions with a prob ~ to their prios
+ edges, prios = zip(*prio.most_common())
+
+ substs = sample_from_list(edges, prios, sample_n)
+ logger.info(
+ 'fixed variable %s to %s in %s\n %s\n<%d out of:\n%s\n',
+ edge_var.n3(),
+ substs[0] if substs else '',
+ child,
+ '\n '.join([subst.n3() for subst in substs]),
+ sample_n,
+ '\n'.join([
+ ' %.3f: %s' % (c, v.n3()) for v, c in prio.most_common()]),
+ )
+ fixed = True
+ children = [
+ GraphPattern(child, mapping={edge_var: subst})
+ for subst in substs
+ ]
+ children = [
+ c for c in children if fit_to_live(c)
+ ]
+ if children:
+ child = children[0]
+ return child, fixed
+
+
+def mutate_deep_narrow_path(
+ child, sparql, timeout, gtp_scores,
+ _rec_depth=0,
+ start_node=None,
+    target_nodes=None,
+ min_len=config.MUTPB_DN_MIN_LEN,
+ max_len=config.MUTPB_DN_MAX_LEN,
+ term_pb=config.MUTPB_DN_TERM_PB,
+ pb_en_out_link=config.MUTPB_EN_OUT_LINK,
+ retries=config.MUTPB_DN_REC_RETRIES,
+):
+ assert isinstance(child, GraphPattern)
+ assert min_len > 0
+ if _rec_depth > max_len:
+ return None
+ if _rec_depth >= min_len and random.random() < term_pb:
+ return None
+ if _rec_depth == 0:
+ nodes = child.nodes
+ if not start_node:
+ start_node = random.choice(list(nodes))
+ target_nodes = list(nodes - {start_node})
+    if _rec_depth >= min_len:
+ closed_gp = child
+ for node in target_nodes:
+ var_edge_to_target = gen_random_var()
+ if random.random() < pb_en_out_link:
+ new_triple = (start_node, var_edge_to_target, node)
+ else:
+ new_triple = (node, var_edge_to_target, start_node)
+ closed_gp += [new_triple]
+ closed_gp, fixed_edge_to_target = _mutate_deep_narrow_path_helper(
+            sparql, timeout, gtp_scores, closed_gp, var_edge_to_target, node)
+ if fixed_edge_to_target:
+ return closed_gp
+
+ gp = child
+ new_triple, var_node, var_edge = _mutate_expand_node_helper(start_node)
+ gp += [new_triple]
+ for r in range(retries):
+ fixed_gp, fixed = _mutate_deep_narrow_path_helper(
+ sparql, timeout, gtp_scores, gp, var_edge, var_node)
+ rec_gp = mutate_deep_narrow_path(
+ fixed_gp, sparql, timeout, gtp_scores,
+ _rec_depth+1,
+ start_node=var_node,
+            target_nodes=target_nodes,
+ )
+ if rec_gp:
+ return rec_gp
+ if fixed:
+ if _rec_depth > min_len:
+ return fixed_gp
+ if _rec_depth == 0:
+ return child
+ return None
+
+
def mutate_simplify_pattern(gp):
if len(gp) < 2:
return gp
@@ -757,6 +889,7 @@ def mutate(
pb_dt=config.MUTPB_DT,
pb_en=config.MUTPB_EN,
pb_fv=config.MUTPB_FV,
+ pb_dn=config.MUTPB_DN,
pb_id=config.MUTPB_ID,
pb_iv=config.MUTPB_IV,
pb_mv=config.MUTPB_MV,
@@ -796,15 +929,15 @@ def mutate(
if random.random() < pb_sp:
child = mutate_simplify_pattern(child)
+ if random.random() < pb_dn:
+ child = mutate_deep_narrow_path(child, sparql, timeout, gtp_scores)
+
if random.random() < pb_fv:
child = canonicalize(child)
children = mutate_fix_var(sparql, timeout, gtp_scores, child)
else:
children = [child]
-
- # TODO: deep & narrow paths mutation
-
children = {
c if fit_to_live(c) else orig_child
for c in children
diff --git a/gp_query.py b/gp_query.py
index 1bdd691..fd44fd4 100644
--- a/gp_query.py
+++ b/gp_query.py
@@ -32,6 +32,8 @@
from graph_pattern import TARGET_VAR
from graph_pattern import ASK_VAR
from graph_pattern import COUNT_VAR
+from graph_pattern import NODE_VAR_SUM
+from graph_pattern import EDGE_VAR_COUNT
from utils import exception_stack_catcher
from utils import sparql_json_result_bindings_to_rdflib
from utils import timer
@@ -279,7 +281,6 @@ def _combined_chunk_res(q_res, _vars, _ret_val_mapping):
return chunk_res
-
def count_query(sparql, timeout, graph_pattern, source=None,
**kwds):
assert isinstance(graph_pattern, GraphPattern)
@@ -457,6 +458,78 @@ def _var_subst_res_update(res, update, **_):
res += update
+def dnp_query(
+ sparql, timeout, graph_pattern, source_target_pairs,
+ edge_var, node_var, max_node_count, min_edge_count, limit,
+ batch_size=config.BATCH_SIZE
+):
+ _vars, _values, _ret_val_mapping = _get_vars_values_mapping(
+ graph_pattern, source_target_pairs)
+ return _multi_query(
+ sparql, timeout, graph_pattern, source_target_pairs, batch_size,
+ _vars, _values, _ret_val_mapping,
+ _dnp_res_init, _dnp_chunk_q,
+ _dnp_chunk_result_ext,
+ _res_update=_dnp_res_update,
+ edge_var=edge_var,
+ node_var=node_var,
+ max_node_count=max_node_count,
+ min_edge_count=min_edge_count,
+ limit=limit,
+ # non standard, passed via **kwds, see handling below
+ )
+
+
+# noinspection PyUnusedLocal
+def _dnp_res_init(_, **kwds):
+ return Counter(), Counter()
+
+
+def _dnp_chunk_q(
+ gp, _vars, values_chunk,
+ edge_var, node_var, max_node_count, min_edge_count, limit,
+ **_
+):
+ return gp.to_deep_narrow_path_query(
+ edge_var=edge_var,
+ node_var=node_var,
+ vars_=_vars,
+ values={_vars: values_chunk},
+ max_node_count=max_node_count,
+ min_edge_count=min_edge_count,
+ limit=limit,
+ )
+
+
+# noinspection PyUnusedLocal
+def _dnp_chunk_result_ext(
+ q_res, _vars, _,
+ edge_var,
+ **kwds
+):
+ chunk_edge_count, chunk_node_sum = Counter(), Counter()
+ res_rows_path = ['results', 'bindings']
+ bindings = sparql_json_result_bindings_to_rdflib(
+ get_path(q_res, res_rows_path, default=[])
+ )
+
+ for row in bindings:
+ row_res = get_path(row, [edge_var])
+ edge_count = int(get_path(row, [EDGE_VAR_COUNT], '0'))
+ chunk_edge_count[row_res] += edge_count
+ node_sum_count = int(get_path(row, [NODE_VAR_SUM], '0'))
+ chunk_node_sum[row_res] += node_sum_count
+ return chunk_edge_count, chunk_node_sum,
+
+
+def _dnp_res_update(res, up, **_):
+ edge_count, node_sum_count = res
+ if up:
+ chunk_edge_count, chunk_node_sum = up
+ edge_count.update(chunk_edge_count)
+ node_sum_count.update(chunk_node_sum)
+
+
def generate_stps_from_gp(sparql, gp):
"""Generates a list of source target pairs from a given graph pattern.
diff --git a/graph_pattern.py b/graph_pattern.py
index ddcb6f1..0c7833e 100644
--- a/graph_pattern.py
+++ b/graph_pattern.py
@@ -16,6 +16,7 @@
import logging
import random
import string
+import textwrap
import deap
import deap.base
@@ -41,6 +42,8 @@
TARGET_VAR = Variable('target')
ASK_VAR = Variable('ask')
COUNT_VAR = Variable('count')
+EDGE_VAR_COUNT = Variable('edge_var_count')
+NODE_VAR_SUM = Variable('node_var_sum')
def gen_random_var():
@@ -714,6 +717,103 @@ def to_count_var_over_values_query(self, var, vars_, values, limit):
res += 'LIMIT %d\n' % limit
return self._sparql_prefix(res)
+ def to_deep_narrow_path_query(
+ self, edge_var, node_var, vars_, values,
+ limit, max_node_count, min_edge_count,
+ ):
+ """Counts possible substitutions for edge_var to get a narrow path.
+
+ Meant to perform a query like this:
+        PREFIX dbr: <http://dbpedia.org/resource/>
+ SELECT * WHERE {
+ {
+ SELECT ?edge_var
+ (SUM(?node_var_count) AS ?node_var_sum)
+ (COUNT(?source && ?target) AS ?edge_var_count)
+ (MAX(?node_var_count) AS ?max_node_count)
+ WHERE {
+ SELECT DISTINCT ?source ?target ?edge_var
+ (COUNT(?node_var) AS ?node_var_count)
+ WHERE {
+ VALUES (?source ?target) {
+ (dbr:Barrel dbr:Wine)
+ (dbr:Barrister dbr:Law)
+ (dbr:Beak dbr:Bird)
+ (dbr:Blanket dbr:Bed)
+ }
+ ?node_var ?edge_var ?source .
+                    ?source <http://dbpedia.org/ontology/wikiPageWikiLink> ?target .
+ }
+ }
+ GROUP BY ?edge_var
+ }
+ FILTER(?max_node_count <= 10
+ && ?edge_var_count >= 2)
+ }
+ ORDER BY DESC(?edge_var_count) ASC(?node_var_sum)
+ LIMIT 32
+
+ The idea here is to expand a random node (?source in the example above)
+ with new variable triple and then try to fix its edge in a way that the
+ degree (?node_var_count) isn't too high (<= max_node_count). We're also
+ interested in the avg degree being low. In light of query chunking the
+ sum is returned here (instead of AVG).
+
+ Apart from minimizing the degrees, we would also like to maximize the
+ number of stps an ?edge_var fixation is valid for (?edge_var_count).
+
+ See gp_learner.mutate_deep_narrow_path() for more.
+
+ :param edge_var: Edge variable to find substitution for.
+ :param node_var: Node variable to count.
+ :param vars_: List of vars to fix values for (e.g. ?source, ?target).
+ :param values: List of value lists for vars_.
+ :param max_node_count: Filter on node count of edge variable.
+ :param min_edge_count: Filter for edge count of triples.
+        :param limit: Limit result size.
+ :return: Query String.
+ """
+
+ res = '''\
+ SELECT * WHERE {
+ {
+ SELECT %(edge_var)s
+ (SUM(?node_var_count) AS %(node_var_sum)s)
+ (COUNT(%(vars_and)s) AS %(edge_var_count)s)
+ (MAX(?node_var_count) AS ?max_node_count)
+ WHERE {
+ SELECT DISTINCT %(vars)s %(edge_var)s
+ (COUNT(%(node_var)s) AS ?node_var_count)
+ WHERE {\n%(values_part)s %(triples)s
+ }
+ }
+ GROUP BY %(edge_var)s
+ }
+ FILTER(?max_node_count <= %(max_node_count)d
+ && %(edge_var_count)s >= %(min_edge_count)d)
+ }
+ ORDER BY DESC(%(edge_var_count)s) ASC(%(node_var_sum)s)
+ LIMIT %(limit)d
+ ''' % {
+ # TODO: adapt self._sparql_values_part for template use (indent)
+ 'edge_var': edge_var.n3(),
+ 'node_var_sum': NODE_VAR_SUM.n3(),
+ 'vars_and': ' && '.join([v.n3() for v in vars_]),
+ 'edge_var_count': EDGE_VAR_COUNT.n3(),
+ 'vars': ' '.join([v.n3() for v in vars_]),
+ 'node_var': node_var.n3(),
+ 'values_part': self._sparql_values_part(
+ values, indent=' '),
+ 'triples': '\n '.join(
+ '%s %s %s .' % (s.n3(), p.n3(), o.n3()) for s, p, o in self
+ ),
+ 'limit': limit,
+ 'max_node_count': max_node_count,
+ 'min_edge_count': min_edge_count,
+ }
+ res = textwrap.dedent(res)
+ return self._sparql_prefix(res)
+
def to_dict(self):
return {
'fitness': self.fitness.values if self.fitness.valid else (),
diff --git a/tests/test_gp_learner_offline.py b/tests/test_gp_learner_offline.py
index cf01f5f..3f948d7 100644
--- a/tests/test_gp_learner_offline.py
+++ b/tests/test_gp_learner_offline.py
@@ -4,6 +4,7 @@
from collections import Counter
import logging
import random
+import textwrap
import rdflib
from rdflib import URIRef
@@ -108,6 +109,38 @@ def test_mutate_merge_var():
assert False, "merge never reached one of the cases: %s" % cases
+def test_deep_narrow_path_query():
+ node_var = Variable('node_var')
+ edge_var = Variable('edge_var')
+ gtps = [
+ (dbp['Barrel'], dbp['Wine']),
+ (dbp['Barrister'], dbp['Law']),
+ (dbp['Beak'], dbp['Bird']),
+ (dbp['Blanket'], dbp['Bed']),
+ ]
+
+ gp = GraphPattern([
+ (node_var, edge_var, SOURCE_VAR),
+ (SOURCE_VAR, wikilink, TARGET_VAR)
+ ])
+
+ vars_ = (SOURCE_VAR, TARGET_VAR)
+ res = gp.to_deep_narrow_path_query(
+ edge_var, node_var, vars_, {vars_: gtps},
+ limit=32,
+ max_node_count=10,
+ min_edge_count=2,
+ ).strip()
+ doc = gp.to_deep_narrow_path_query.__doc__
+ doc_str_example_query = "\n".join([
+ l for l in doc.splitlines()
+ if l.startswith(' ')
+ ])
+ doc_str_example_query = textwrap.dedent(doc_str_example_query)
+ assert res == doc_str_example_query, \
+ "res:\n%s\n\ndoes not look like:\n\n%s" % (res, doc_str_example_query)
+
+
def test_simplify_pattern():
gp = GraphPattern([(SOURCE_VAR, wikilink, TARGET_VAR)])
res = mutate_simplify_pattern(gp)
@@ -270,3 +303,7 @@ def test_remaining_gain_sample_gtps():
def test_gtp_scores():
assert gtp_scores - gtp_scores == 0
+
+
+if __name__ == '__main__':
+ test_deep_narrow_path_query()
diff --git a/tests/test_gp_learner_online.py b/tests/test_gp_learner_online.py
index 54c2039..85742f6 100644
--- a/tests/test_gp_learner_online.py
+++ b/tests/test_gp_learner_online.py
@@ -15,6 +15,7 @@
from config import SPARQL_ENDPOINT
from gp_learner import evaluate
+from gp_learner import mutate_deep_narrow_path
from gp_learner import mutate_fix_var
from gp_learner import update_individuals
from gp_query import calibrate_query_timeout
@@ -134,6 +135,17 @@ def test_mutate_fix_var():
assert gp.vars_in_graph - tgp.vars_in_graph
+def test_mutate_deep_narrow_path():
+ p = Variable('p')
+ gp = GraphPattern([
+ (SOURCE_VAR, p, TARGET_VAR)
+ ])
+ child = mutate_deep_narrow_path(gp, sparql, timeout, gtp_scores)
+ assert gp == child or len(child) > len(gp)
+ print(gp)
+ print(child)
+
+
def test_timeout_pattern():
u = URIRef('http://dbpedia.org/resource/Template:Reflist')
wpdisambig = URIRef('http://dbpedia.org/ontology/wikiPageDisambiguates')
@@ -158,3 +170,7 @@ def test_timeout_pattern():
assert fitness.f_measure == 0
else:
assert fitness.f_measure > 0
+
+
+if __name__ == '__main__':
+ test_mutate_deep_narrow_path()