diff --git a/config/defaults.py b/config/defaults.py index 597f09f..2fdfe07 100644 --- a/config/defaults.py +++ b/config/defaults.py @@ -78,6 +78,14 @@ MUTPB_FV_SAMPLE_MAXN = 32 # max n of instantiations to sample from top k MUTPB_FV_QUERY_LIMIT = 256 # SPARQL query limit for the top k instantiations MUTPB_SP = 0.05 # prob to simplify pattern (warning: can restrict exploration) +MUTPB_DN = 0.05 # prob to try a deep and narrow paths mutation +MUTPB_DN_MIN_LEN = 2 # minimum length of the deep and narrow paths +MUTPB_DN_MAX_LEN = 10 # max of path length if not stopped by term_pb +MUTPB_DN_TERM_PB = 0.7 # prob to terminate expansion each step > min_len +MUTPB_DN_MAX_NODE_COUNT = 10 # edge fixations may have <= nodes +MUTPB_DN_MIN_EDGE_COUNT = 2 # edges need to be valid for >= GTPs +MUTPB_DN_QUERY_LIMIT = 32 # SPARQL query limit for top edge fixations +MUTPB_DN_REC_RETRIES = 3 # retrial attempts in each recursion, WARNING: EXP! # for import in helpers and __init__ __all__ = [_v for _v in globals().keys() if _v.isupper()] diff --git a/gp_learner.py b/gp_learner.py index 2f453b0..d463cd0 100644 --- a/gp_learner.py +++ b/gp_learner.py @@ -48,6 +48,7 @@ from gp_query import predict_query from gp_query import query_time_hard_exceeded from gp_query import query_time_soft_exceeded +from gp_query import dnp_query from gp_query import variable_substitution_query from graph_pattern import canonicalize from graph_pattern import gen_random_var @@ -653,6 +654,137 @@ def mutate_fix_var( return res +def _mutate_deep_narrow_path_helper( + sparql, timeout, gtp_scores, child, edge_var, node_var, + gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N, + max_node_count=config.MUTPB_DN_MAX_NODE_COUNT, + min_edge_count=config.MUTPB_DN_MIN_EDGE_COUNT, + limit=config.MUTPB_DN_QUERY_LIMIT, + sample_n=config.MUTPB_FV_SAMPLE_MAXN, +): + assert isinstance(child, GraphPattern) + assert isinstance(gtp_scores, GTPScores) + + # The further we get, the less gtps are remaining. 
Sampling too many (all) + # of them might hurt as common substitutions (> limit ones) which are dead + # ends could cover less common ones that could actually help + gtp_sample_n = min(gtp_sample_n, int(gtp_scores.remaining_gain)) + gtp_sample_n = random.randint(1, gtp_sample_n) + + ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps( + max_n=gtp_sample_n) + t, substitution_counts = dnp_query( + sparql, timeout, child, ground_truth_pairs, + edge_var=edge_var, + node_var=node_var, + max_node_count=max_node_count, + min_edge_count=min_edge_count, + limit=limit, + ) + edge_count, node_sum_count = substitution_counts + if not node_sum_count: + # the current pattern is unfit, as we can't find anything fulfilling it + logger.debug("tried to fix a var %s without result:\n%s" + "seems as if the pattern can't be fulfilled!", + edge_var, child.to_sparql_select_query()) + fixed = False + return child, fixed + mutate_fix_var_filter(node_sum_count) + mutate_fix_var_filter(edge_count) + if not node_sum_count: + # could have happened that we removed the only possible substitution + fixed = False + return child, fixed + + prio = Counter() + for edge, node_sum in node_sum_count.items(): + ec = edge_count[edge] + prio[edge] = ec / (node_sum / ec) # ec / AVG degree + # randomly pick n of the substitutions with a prob ~ to their prios + edges, prios = zip(*prio.most_common()) + + substs = sample_from_list(edges, prios, sample_n) + logger.info( + 'fixed variable %s to %s in %s\n %s\n<%d out of:\n%s\n', + edge_var.n3(), + substs[0] if substs else '', + child, + '\n '.join([subst.n3() for subst in substs]), + sample_n, + '\n'.join([ + ' %.3f: %s' % (c, v.n3()) for v, c in prio.most_common()]), + ) + fixed = True + children = [ + GraphPattern(child, mapping={edge_var: subst}) + for subst in substs + ] + children = [ + c for c in children if fit_to_live(c) + ] + if children: + child = children[0] + return child, fixed + + +def mutate_deep_narrow_path( + child, sparql, timeout, 
gtp_scores, + _rec_depth=0, + start_node=None, + target_nodes = None, + min_len=config.MUTPB_DN_MIN_LEN, + max_len=config.MUTPB_DN_MAX_LEN, + term_pb=config.MUTPB_DN_TERM_PB, + pb_en_out_link=config.MUTPB_EN_OUT_LINK, + retries=config.MUTPB_DN_REC_RETRIES, +): + assert isinstance(child, GraphPattern) + assert min_len > 0 + if _rec_depth > max_len: + return None + if _rec_depth >= min_len and random.random() < term_pb: + return None + if _rec_depth == 0: + nodes = child.nodes + if not start_node: + start_node = random.choice(list(nodes)) + target_nodes = list(nodes - {start_node}) + if _rec_depth >=min_len: + closed_gp = child + for node in target_nodes: + var_edge_to_target = gen_random_var() + if random.random() < pb_en_out_link: + new_triple = (start_node, var_edge_to_target, node) + else: + new_triple = (node, var_edge_to_target, start_node) + closed_gp += [new_triple] + closed_gp, fixed_edge_to_target = _mutate_deep_narrow_path_helper( + sparql, timeout, gtp_scores, closed_gp,var_edge_to_target, node) + if fixed_edge_to_target: + return closed_gp + + gp = child + new_triple, var_node, var_edge = _mutate_expand_node_helper(start_node) + gp += [new_triple] + for r in range(retries): + fixed_gp, fixed = _mutate_deep_narrow_path_helper( + sparql, timeout, gtp_scores, gp, var_edge, var_node) + rec_gp = mutate_deep_narrow_path( + fixed_gp, sparql, timeout, gtp_scores, + _rec_depth+1, + start_node=var_node, + target_nodes = target_nodes, + ) + if rec_gp: + return rec_gp + if fixed: + if _rec_depth > min_len: + return fixed_gp + if _rec_depth == 0: + return child + return None + + def mutate_simplify_pattern(gp): if len(gp) < 2: return gp @@ -757,6 +889,7 @@ def mutate( pb_dt=config.MUTPB_DT, pb_en=config.MUTPB_EN, pb_fv=config.MUTPB_FV, + pb_dn=config.MUTPB_DN, pb_id=config.MUTPB_ID, pb_iv=config.MUTPB_IV, pb_mv=config.MUTPB_MV, @@ -796,15 +929,15 @@ def mutate( if random.random() < pb_sp: child = mutate_simplify_pattern(child) + if random.random() < pb_dn: + child = 
mutate_deep_narrow_path(child, sparql, timeout, gtp_scores) + if random.random() < pb_fv: child = canonicalize(child) children = mutate_fix_var(sparql, timeout, gtp_scores, child) else: children = [child] - - # TODO: deep & narrow paths mutation - children = { c if fit_to_live(c) else orig_child for c in children diff --git a/gp_query.py b/gp_query.py index 1bdd691..fd44fd4 100644 --- a/gp_query.py +++ b/gp_query.py @@ -32,6 +32,8 @@ from graph_pattern import TARGET_VAR from graph_pattern import ASK_VAR from graph_pattern import COUNT_VAR +from graph_pattern import NODE_VAR_SUM +from graph_pattern import EDGE_VAR_COUNT from utils import exception_stack_catcher from utils import sparql_json_result_bindings_to_rdflib from utils import timer @@ -279,7 +281,6 @@ def _combined_chunk_res(q_res, _vars, _ret_val_mapping): return chunk_res - def count_query(sparql, timeout, graph_pattern, source=None, **kwds): assert isinstance(graph_pattern, GraphPattern) @@ -457,6 +458,78 @@ def _var_subst_res_update(res, update, **_): res += update +def dnp_query( + sparql, timeout, graph_pattern, source_target_pairs, + edge_var, node_var, max_node_count, min_edge_count, limit, + batch_size=config.BATCH_SIZE +): + _vars, _values, _ret_val_mapping = _get_vars_values_mapping( + graph_pattern, source_target_pairs) + return _multi_query( + sparql, timeout, graph_pattern, source_target_pairs, batch_size, + _vars, _values, _ret_val_mapping, + _dnp_res_init, _dnp_chunk_q, + _dnp_chunk_result_ext, + _res_update=_dnp_res_update, + edge_var=edge_var, + node_var=node_var, + max_node_count=max_node_count, + min_edge_count=min_edge_count, + limit=limit, + # non standard, passed via **kwds, see handling below + ) + + +# noinspection PyUnusedLocal +def _dnp_res_init(_, **kwds): + return Counter(), Counter() + + +def _dnp_chunk_q( + gp, _vars, values_chunk, + edge_var, node_var, max_node_count, min_edge_count, limit, + **_ +): + return gp.to_deep_narrow_path_query( + edge_var=edge_var, + 
node_var=node_var, + vars_=_vars, + values={_vars: values_chunk}, + max_node_count=max_node_count, + min_edge_count=min_edge_count, + limit=limit, + ) + + +# noinspection PyUnusedLocal +def _dnp_chunk_result_ext( + q_res, _vars, _, + edge_var, + **kwds +): + chunk_edge_count, chunk_node_sum = Counter(), Counter() + res_rows_path = ['results', 'bindings'] + bindings = sparql_json_result_bindings_to_rdflib( + get_path(q_res, res_rows_path, default=[]) + ) + + for row in bindings: + row_res = get_path(row, [edge_var]) + edge_count = int(get_path(row, [EDGE_VAR_COUNT], '0')) + chunk_edge_count[row_res] += edge_count + node_sum_count = int(get_path(row, [NODE_VAR_SUM], '0')) + chunk_node_sum[row_res] += node_sum_count + return chunk_edge_count, chunk_node_sum, + + +def _dnp_res_update(res, up, **_): + edge_count, node_sum_count = res + if up: + chunk_edge_count, chunk_node_sum = up + edge_count.update(chunk_edge_count) + node_sum_count.update(chunk_node_sum) + + def generate_stps_from_gp(sparql, gp): """Generates a list of source target pairs from a given graph pattern. diff --git a/graph_pattern.py b/graph_pattern.py index ddcb6f1..0c7833e 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -16,6 +16,7 @@ import logging import random import string +import textwrap import deap import deap.base @@ -41,6 +42,8 @@ TARGET_VAR = Variable('target') ASK_VAR = Variable('ask') COUNT_VAR = Variable('count') +EDGE_VAR_COUNT = Variable('edge_var_count') +NODE_VAR_SUM = Variable('node_var_sum') def gen_random_var(): @@ -714,6 +717,103 @@ def to_count_var_over_values_query(self, var, vars_, values, limit): res += 'LIMIT %d\n' % limit return self._sparql_prefix(res) + def to_deep_narrow_path_query( + self, edge_var, node_var, vars_, values, + limit, max_node_count, min_edge_count, + ): + """Counts possible substitutions for edge_var to get a narrow path. 
+ + Meant to perform a query like this: + PREFIX dbr: <http://dbpedia.org/resource/> + SELECT * WHERE { + { + SELECT ?edge_var + (SUM(?node_var_count) AS ?node_var_sum) + (COUNT(?source && ?target) AS ?edge_var_count) + (MAX(?node_var_count) AS ?max_node_count) + WHERE { + SELECT DISTINCT ?source ?target ?edge_var + (COUNT(?node_var) AS ?node_var_count) + WHERE { + VALUES (?source ?target) { + (dbr:Barrel dbr:Wine) + (dbr:Barrister dbr:Law) + (dbr:Beak dbr:Bird) + (dbr:Blanket dbr:Bed) + } + ?node_var ?edge_var ?source . + ?source <http://dbpedia.org/ontology/wikiPageWikiLink> ?target . + } + } + GROUP BY ?edge_var + } + FILTER(?max_node_count <= 10 + && ?edge_var_count >= 2) + } + ORDER BY DESC(?edge_var_count) ASC(?node_var_sum) + LIMIT 32 + + The idea here is to expand a random node (?source in the example above) + with a new variable triple and then try to fix its edge in a way that the + degree (?node_var_count) isn't too high (<= max_node_count). We're also + interested in the avg degree being low. In light of query chunking the + sum is returned here (instead of AVG). + + Apart from minimizing the degrees, we would also like to maximize the + number of stps an ?edge_var fixation is valid for (?edge_var_count). + + See gp_learner.mutate_deep_narrow_path() for more. + + :param edge_var: Edge variable to find substitution for. + :param node_var: Node variable to count. + :param vars_: List of vars to fix values for (e.g. ?source, ?target). + :param values: List of value lists for vars_. + :param max_node_count: Filter on node count of edge variable. + :param min_edge_count: Filter for edge count of triples. + :param limit: Limit result size. + :return: Query String. 
+ """ + + res = '''\ + SELECT * WHERE { + { + SELECT %(edge_var)s + (SUM(?node_var_count) AS %(node_var_sum)s) + (COUNT(%(vars_and)s) AS %(edge_var_count)s) + (MAX(?node_var_count) AS ?max_node_count) + WHERE { + SELECT DISTINCT %(vars)s %(edge_var)s + (COUNT(%(node_var)s) AS ?node_var_count) + WHERE {\n%(values_part)s %(triples)s + } + } + GROUP BY %(edge_var)s + } + FILTER(?max_node_count <= %(max_node_count)d + && %(edge_var_count)s >= %(min_edge_count)d) + } + ORDER BY DESC(%(edge_var_count)s) ASC(%(node_var_sum)s) + LIMIT %(limit)d + ''' % { + # TODO: adapt self._sparql_values_part for template use (indent) + 'edge_var': edge_var.n3(), + 'node_var_sum': NODE_VAR_SUM.n3(), + 'vars_and': ' && '.join([v.n3() for v in vars_]), + 'edge_var_count': EDGE_VAR_COUNT.n3(), + 'vars': ' '.join([v.n3() for v in vars_]), + 'node_var': node_var.n3(), + 'values_part': self._sparql_values_part( + values, indent=' '), + 'triples': '\n '.join( + '%s %s %s .' % (s.n3(), p.n3(), o.n3()) for s, p, o in self + ), + 'limit': limit, + 'max_node_count': max_node_count, + 'min_edge_count': min_edge_count, + } + res = textwrap.dedent(res) + return self._sparql_prefix(res) + def to_dict(self): return { 'fitness': self.fitness.values if self.fitness.valid else (), diff --git a/tests/test_gp_learner_offline.py b/tests/test_gp_learner_offline.py index cf01f5f..3f948d7 100644 --- a/tests/test_gp_learner_offline.py +++ b/tests/test_gp_learner_offline.py @@ -4,6 +4,7 @@ from collections import Counter import logging import random +import textwrap import rdflib from rdflib import URIRef @@ -108,6 +109,38 @@ def test_mutate_merge_var(): assert False, "merge never reached one of the cases: %s" % cases +def test_deep_narrow_path_query(): + node_var = Variable('node_var') + edge_var = Variable('edge_var') + gtps = [ + (dbp['Barrel'], dbp['Wine']), + (dbp['Barrister'], dbp['Law']), + (dbp['Beak'], dbp['Bird']), + (dbp['Blanket'], dbp['Bed']), + ] + + gp = GraphPattern([ + (node_var, edge_var, 
SOURCE_VAR), + (SOURCE_VAR, wikilink, TARGET_VAR) + ]) + + vars_ = (SOURCE_VAR, TARGET_VAR) + res = gp.to_deep_narrow_path_query( + edge_var, node_var, vars_, {vars_: gtps}, + limit=32, + max_node_count=10, + min_edge_count=2, + ).strip() + doc = gp.to_deep_narrow_path_query.__doc__ + doc_str_example_query = "\n".join([ + l for l in doc.splitlines() + if l.startswith(' ') + ]) + doc_str_example_query = textwrap.dedent(doc_str_example_query) + assert res == doc_str_example_query, \ + "res:\n%s\n\ndoes not look like:\n\n%s" % (res, doc_str_example_query) + + def test_simplify_pattern(): gp = GraphPattern([(SOURCE_VAR, wikilink, TARGET_VAR)]) res = mutate_simplify_pattern(gp) @@ -270,3 +303,7 @@ def test_remaining_gain_sample_gtps(): def test_gtp_scores(): assert gtp_scores - gtp_scores == 0 + + +if __name__ == '__main__': + test_deep_narrow_path_query() diff --git a/tests/test_gp_learner_online.py b/tests/test_gp_learner_online.py index 54c2039..85742f6 100644 --- a/tests/test_gp_learner_online.py +++ b/tests/test_gp_learner_online.py @@ -15,6 +15,7 @@ from config import SPARQL_ENDPOINT from gp_learner import evaluate +from gp_learner import mutate_deep_narrow_path from gp_learner import mutate_fix_var from gp_learner import update_individuals from gp_query import calibrate_query_timeout @@ -134,6 +135,17 @@ def test_mutate_fix_var(): assert gp.vars_in_graph - tgp.vars_in_graph +def test_mutate_deep_narrow_path(): + p = Variable('p') + gp = GraphPattern([ + (SOURCE_VAR, p, TARGET_VAR) + ]) + child = mutate_deep_narrow_path(gp, sparql, timeout, gtp_scores) + assert gp == child or len(child) > len(gp) + print(gp) + print(child) + + def test_timeout_pattern(): u = URIRef('http://dbpedia.org/resource/Template:Reflist') wpdisambig = URIRef('http://dbpedia.org/ontology/wikiPageDisambiguates') @@ -158,3 +170,7 @@ def test_timeout_pattern(): assert fitness.f_measure == 0 else: assert fitness.f_measure > 0 + + +if __name__ == '__main__': + test_mutate_deep_narrow_path()