From 1fd4b978ad533cee87f4e274dc110e45b2fd063f Mon Sep 17 00:00:00 2001 From: Nandika Kalra Date: Thu, 16 Feb 2017 20:20:36 +0100 Subject: [PATCH 1/9] WIP: narrow deep paths mutation --- config/defaults.py | 7 ++ gp_learner.py | 107 ++++++++++++++++++++++++++++++- gp_query.py | 65 ++++++++++++++++++- graph_pattern.py | 83 ++++++++++++++++++++++++ tests/test_gp_learner_offline.py | 35 ++++++++++ 5 files changed, 293 insertions(+), 4 deletions(-) diff --git a/config/defaults.py b/config/defaults.py index 597f09f..370067c 100644 --- a/config/defaults.py +++ b/config/defaults.py @@ -78,6 +78,13 @@ MUTPB_FV_SAMPLE_MAXN = 32 # max n of instantiations to sample from top k MUTPB_FV_QUERY_LIMIT = 256 # SPARQL query limit for the top k instantiations MUTPB_SP = 0.05 # prob to simplify pattern (warning: can restrict exploration) +MUTPB_DN = 0.05 # prob to try a deep and narrow paths mutation +MUTPB_DN_MIN_LEN = 2 # minimum length of the deep and narrow paths +MUTPB_DN_MAX_LEN = 10 # absolute max of path length if not stopped by term_pb +MUTPB_DN_TERM_PB = 0.3 # prob to terminate node expansion each step > min_len +MUTPB_DN_FILTER_NODE_COUNT = 10 +MUTPB_DN_FILTER_EDGE_COUNT = 1 +MUTPB_DN_QUERY_LIMIT = 32 # for import in helpers and __init__ __all__ = [_v for _v in globals().keys() if _v.isupper()] diff --git a/gp_learner.py b/gp_learner.py index 2f453b0..68c6c25 100644 --- a/gp_learner.py +++ b/gp_learner.py @@ -48,6 +48,7 @@ from gp_query import predict_query from gp_query import query_time_hard_exceeded from gp_query import query_time_soft_exceeded +from gp_query import variable_substitution_deep_narrow_mut_query from gp_query import variable_substitution_query from graph_pattern import canonicalize from graph_pattern import gen_random_var @@ -653,6 +654,105 @@ def mutate_fix_var( return res +def _mutate_deep_narrow_path_helper( + sparql, + timeout, + gtp_scores, + child, + edge_var, + node_var, + gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N, + 
limit_res=config.MUTPB_DN_QUERY_LIMIT, + sample_n=config.MUTPB_FV_SAMPLE_MAXN, +): + assert isinstance(child, GraphPattern) + assert isinstance(gtp_scores, GTPScores) + + # The further we get, the less gtps are remaining. Sampling too many (all) + # of them might hurt as common substitutions (> limit ones) which are dead + # ends could cover less common ones that could actually help + gtp_sample_n = min(gtp_sample_n, int(gtp_scores.remaining_gain)) + gtp_sample_n = random.randint(1, gtp_sample_n) + + ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps( + n=gtp_sample_n) + t, substitution_counts = variable_substitution_deep_narrow_mut_query( + sparql, timeout, child, edge_var, node_var, ground_truth_pairs, + limit_res) + edge_count, node_sum_count = substitution_counts + if not node_sum_count: + # the current pattern is unfit, as we can't find anything fulfilling it + logger.debug("tried to fix a var %s without result:\n%s" + "seems as if the pattern can't be fulfilled!", + edge_var, child.to_sparql_select_query()) + fixed = False + return child, fixed + mutate_fix_var_filter(node_sum_count) + mutate_fix_var_filter(edge_count) + if not node_sum_count: + # could have happened that we removed the only possible substitution + fixed = False + return child, fixed + + prio = Counter() + for edge, node_sum in node_sum_count.items(): + ec = edge_count[edge] + prio[edge] = ec / (node_sum / ec) # ec / AVG degree + # randomly pick n of the substitutions with a prob ~ to their counts + edges, prios = zip(*prio.most_common()) + + substs = sample_from_list(edges, prios, sample_n) + + logger.info( + 'fixed variable %s in %sto:\n %s\n<%d out of:\n%s\n', + edge_var.n3(), + child, + '\n '.join([subst.n3() for subst in substs]), + sample_n, + '\n'.join([ + ' %.3f: %s' % (c, v.n3()) for v, c in prio.most_common()]), + ) + fixed = True + orig_child = child + children = [ + GraphPattern(child, mapping={edge_var: subst}) + for subst in substs + ] + children = [ + c if 
fit_to_live(c) else orig_child + for c in children + ] + if children: + child = random.choice(list(children)) + return child, fixed + + +def mutate_deep_narrow_path( + child, sparql, timeout, gtp_scores, + min_len=config.MUTPB_DN_MIN_LEN, + max_len=config.MUTPB_DN_MAX_LEN, + term_pb=config.MUTPB_DN_TERM_PB, +): + assert isinstance(child, GraphPattern) + nodes = list(child.nodes) + start_node = random.choice(nodes) + # target_nodes = set(nodes) - {start_node} + gp = child + hop = 0 + while True: + if hop >= min_len and random.random() < term_pb: + break + if hop >= max_len: + break + hop += 1 + new_triple, var_node, var_edge = _mutate_expand_node_helper(start_node) + gp += [new_triple] + gp, fixed = _mutate_deep_narrow_path_helper( + sparql, timeout, gtp_scores, gp, var_edge, var_node) + start_node = var_node + return gp + + def mutate_simplify_pattern(gp): if len(gp) < 2: return gp @@ -757,6 +857,7 @@ def mutate( pb_dt=config.MUTPB_DT, pb_en=config.MUTPB_EN, pb_fv=config.MUTPB_FV, + pb_dn=config.MUTPB_DN, pb_id=config.MUTPB_ID, pb_iv=config.MUTPB_IV, pb_mv=config.MUTPB_MV, @@ -796,15 +897,15 @@ def mutate( if random.random() < pb_sp: child = mutate_simplify_pattern(child) + if random.random() < pb_dn: + child = mutate_deep_narrow_path(child, sparql, timeout, gtp_scores) + if random.random() < pb_fv: child = canonicalize(child) children = mutate_fix_var(sparql, timeout, gtp_scores, child) else: children = [child] - - # TODO: deep & narrow paths mutation - children = { c if fit_to_live(c) else orig_child for c in children diff --git a/gp_query.py b/gp_query.py index 1bdd691..10f9003 100644 --- a/gp_query.py +++ b/gp_query.py @@ -32,6 +32,8 @@ from graph_pattern import TARGET_VAR from graph_pattern import ASK_VAR from graph_pattern import COUNT_VAR +from graph_pattern import NODE_VAR_SUM +from graph_pattern import EDGE_VAR_COUNT from utils import exception_stack_catcher from utils import sparql_json_result_bindings_to_rdflib from utils import timer @@ -279,7 +281,6 @@ 
def _combined_chunk_res(q_res, _vars, _ret_val_mapping): return chunk_res - def count_query(sparql, timeout, graph_pattern, source=None, **kwds): assert isinstance(graph_pattern, GraphPattern) @@ -457,6 +458,68 @@ def _var_subst_res_update(res, update, **_): res += update +def variable_substitution_deep_narrow_mut_query( + sparql, timeout, graph_pattern, edge_var, node_var, + source_target_pairs, limit_res, batch_size=config.BATCH_SIZE): + _vars, _values, _ret_val_mapping = _get_vars_values_mapping( + graph_pattern, source_target_pairs) + _edge_var_node_var_and_vars = (edge_var, node_var, _vars) + return _multi_query( + sparql, timeout, graph_pattern, source_target_pairs, batch_size, + _edge_var_node_var_and_vars, _values, _ret_val_mapping, + _var_subst_dnp_res_init, _var_subst_dnp_chunk_q, + _var_subst_dnp_chunk_result_ext, + _res_update=_var_subst_dnp_update, + limit=limit_res, + # non standard, passed via **kwds, see handling below + ) + + +# noinspection PyUnusedLocal +def _var_subst_dnp_res_init(_, **kwds): + return Counter(), Counter() + + +def _var_subst_dnp_chunk_q(gp, _edge_var_node_var_and_vars, + values_chunk, limit): + edge_var, node_var, _vars = _edge_var_node_var_and_vars + return gp.to_find_edge_var_for_narrow_path_query( + edge_var=edge_var, + node_var=node_var, + vars_=_vars, + values={_vars: values_chunk}, + limit_res=limit) + + +# noinspection PyUnusedLocal +def _var_subst_dnp_chunk_result_ext( + q_res, _edge_var_node_var_and_vars, _, **kwds): + edge_var, node_var, _vars = _edge_var_node_var_and_vars + chunk_edge_count, chunk_node_sum = Counter(), Counter() + res_rows_path = ['results', 'bindings'] + bindings = sparql_json_result_bindings_to_rdflib( + get_path(q_res, res_rows_path, default=[]) + ) + + for row in bindings: + row_res = get_path(row, [edge_var]) + edge_count = int(get_path(row, [EDGE_VAR_COUNT], '0')) + chunk_edge_count[row_res] += edge_count + node_sum_count = int(get_path(row, [NODE_VAR_SUM], '0')) + chunk_node_sum[row_res] += 
node_sum_count + return chunk_edge_count, chunk_node_sum, + + +def _var_subst_dnp_update(res, up, **_): + edge_count, node_sum_count = res + try: + chunk_edge_count, chunk_node_sum = up + edge_count.update(chunk_edge_count) + node_sum_count.update(chunk_node_sum) + except ValueError: + pass + + def generate_stps_from_gp(sparql, gp): """Generates a list of source target pairs from a given graph pattern. diff --git a/graph_pattern.py b/graph_pattern.py index ddcb6f1..9990f1e 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -31,6 +31,7 @@ import six from utils import URIShortener +import config logger = logging.getLogger(__name__) @@ -41,6 +42,8 @@ TARGET_VAR = Variable('target') ASK_VAR = Variable('ask') COUNT_VAR = Variable('count') +EDGE_VAR_COUNT = Variable('edge_var_count') +NODE_VAR_SUM = Variable('node_var_sum') def gen_random_var(): @@ -714,6 +717,86 @@ def to_count_var_over_values_query(self, var, vars_, values, limit): res += 'LIMIT %d\n' % limit return self._sparql_prefix(res) + def to_find_edge_var_for_narrow_path_query( + self, edge_var, node_var, vars_, values, limit_res, + filter_node_count=config.MUTPB_DN_FILTER_NODE_COUNT, + filter_edge_count=config.MUTPB_DN_FILTER_EDGE_COUNT, + ): + """Counts possible substitutions for edge_var to get a narrow path + + Meant to perform a query like this: + SELECT * + { + { + SELECT + ?edge_var + (COUNT(*) AS ?edge_var_count) + (MAX(?node_var_count) AS ?max_node_count) + (COUNT(*)/AVG(?node_var_count) as ?prio_var) + { + SELECT DISTINCT + ?source ?target ?edge_var (COUNT(?node_var) AS ?node_var_count) + { + VALUES (?source ?target) { + (dbr:Adolescence dbr:Youth) + (dbr:Adult dbr:Child) + (dbr:Angel dbr:Heaven) + (dbr:Arithmetic dbr:Mathematics) + } + ?node_var ?edge_var ?source . + ?source dbo:wikiPageWikiLink ?target . 
+ } + } + GROUP BY ?edge_var + ORDER BY DESC(?edge_var_count) + } + FILTER(?max_node_count < 10 && ?edge_var_count > 1) + } + ORDER BY DESC(?prio_var) + LIMIT 32 + + :param edge_var: Edge variable to find substitution for. + :param node_var: Node variable to count. + :param vars_: List of vars to fix values for (e.g. ?source, ?target). + :param values: List of value lists for vars_. + :param filter_node_count: Filter on node count of edge variable. + :param filter_edge_count: Filter for edge count of triples. + :param limit_res : limit result size + :return: Query String. + """ + + res = 'SELECT * WHERE {\n' + res += ' {\n'\ + ' SELECT %s (SUM (?node_var_count) AS %s) (COUNT(%s) AS %s) ' \ + '(MAX(?node_var_count) AS ?max_node_count) WHERE {\n' % ( + edge_var.n3(), + NODE_VAR_SUM.n3(), + ' && '.join([v.n3() for v in vars_]), + EDGE_VAR_COUNT.n3(), ) + res += ' SELECT DISTINCT %s %s (COUNT(%s) AS ?node_var_count) ' \ + 'WHERE {\n ' % (' '.join([v.n3() for v in vars_]), + edge_var.n3(), node_var.n3(), ) + res += self._sparql_values_part(values) + + # triples part + tres = [] + for s, p, o in self: + tres.append('%s %s %s .' 
% (s.n3(), p.n3(), o.n3())) + indent = ' ' * 3 + triples = indent + ('\n' + indent).join(tres) + '\n' + res += triples + res += ' }\n'\ + ' }\n' + res += ' GROUP BY %s\n' % edge_var.n3() + res += ' }\n' + res += ' FILTER(?max_node_count < %d && %s > %d)\n' \ + % (filter_node_count, EDGE_VAR_COUNT.n3(), + filter_edge_count) + res += '}\n' + res += 'ORDER BY ASC(%s)\n' % NODE_VAR_SUM.n3() + res += 'LIMIT %d' % limit_res + return self._sparql_prefix(res) + def to_dict(self): return { 'fitness': self.fitness.values if self.fitness.valid else (), diff --git a/tests/test_gp_learner_offline.py b/tests/test_gp_learner_offline.py index cf01f5f..057be67 100644 --- a/tests/test_gp_learner_offline.py +++ b/tests/test_gp_learner_offline.py @@ -13,6 +13,7 @@ from gp_learner import mutate_increase_dist from gp_learner import mutate_merge_var from gp_learner import mutate_simplify_pattern +from gp_learner import mutate_deep_narrow_path from graph_pattern import GraphPattern from graph_pattern import SOURCE_VAR from graph_pattern import TARGET_VAR @@ -108,6 +109,35 @@ def test_mutate_merge_var(): assert False, "merge never reached one of the cases: %s" % cases +def test_mutate_deep_narrow_path(): + p = Variable('p') + gp = GraphPattern([ + (SOURCE_VAR, p, TARGET_VAR) + ]) + child = mutate_deep_narrow_path(gp) + assert gp == child or len(child) > len(gp) + print(gp) + print(child) + + +def test_to_find_edge_var_for_narrow_path_query(): + node_var = Variable('node_variable') + edge_var = Variable('edge_variable') + gp = GraphPattern([ + (node_var, edge_var, SOURCE_VAR), + (SOURCE_VAR, wikilink, TARGET_VAR) + ]) + filter_node_count = 10 + filter_edge_count = 1 + limit_res = 32 + vars_ = {SOURCE_VAR,TARGET_VAR} + res = GraphPattern.to_find_edge_var_for_narrow_path_query(gp, edge_var, node_var, + vars_, filter_node_count, + filter_edge_count, limit_res) + print(gp) + print(res) + + def test_simplify_pattern(): gp = GraphPattern([(SOURCE_VAR, wikilink, TARGET_VAR)]) res = 
mutate_simplify_pattern(gp) @@ -270,3 +300,8 @@ def test_remaining_gain_sample_gtps(): def test_gtp_scores(): assert gtp_scores - gtp_scores == 0 + + +if __name__ == '__main__': + # test_mutate_deep_narrow_path() + test_to_find_edge_var_for_narrow_path_query() From 62f45c1ee18f04c3d9a41e8a31c343b699db4ef8 Mon Sep 17 00:00:00 2001 From: Joern Hees Date: Fri, 17 Feb 2017 03:26:51 +0100 Subject: [PATCH 2/9] deep narrow paths mutation conf var docs, cleanup and renames --- config/defaults.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/config/defaults.py b/config/defaults.py index 370067c..2c91c34 100644 --- a/config/defaults.py +++ b/config/defaults.py @@ -80,11 +80,11 @@ MUTPB_SP = 0.05 # prob to simplify pattern (warning: can restrict exploration) MUTPB_DN = 0.05 # prob to try a deep and narrow paths mutation MUTPB_DN_MIN_LEN = 2 # minimum length of the deep and narrow paths -MUTPB_DN_MAX_LEN = 10 # absolute max of path length if not stopped by term_pb -MUTPB_DN_TERM_PB = 0.3 # prob to terminate node expansion each step > min_len -MUTPB_DN_FILTER_NODE_COUNT = 10 -MUTPB_DN_FILTER_EDGE_COUNT = 1 -MUTPB_DN_QUERY_LIMIT = 32 +MUTPB_DN_MAX_LEN = 10 # max of path length if not stopped by term_pb +MUTPB_DN_TERM_PB = 0.3 # prob to terminate expansion each step > min_len +MUTPB_DN_MAX_NODE_COUNT = 10 # edge fixations may have <= nodes +MUTPB_DN_MIN_EDGE_COUNT = 2 # edges need to be valid for >= GTPs +MUTPB_DN_QUERY_LIMIT = 32 # SPARQL query limit for top edge fixations # for import in helpers and __init__ __all__ = [_v for _v in globals().keys() if _v.isupper()] From 0ec39a7558dbf104b751a9aa1d589d6839194811 Mon Sep 17 00:00:00 2001 From: Joern Hees Date: Fri, 17 Feb 2017 03:30:36 +0100 Subject: [PATCH 3/9] refactoring of deep-narrow-paths query, sparql (uses template now), and args passing, also tons of minor things --- gp_learner.py | 38 ++++++------ gp_query.py | 54 ++++++++++------- graph_pattern.py | 147 
++++++++++++++++++++++++++--------------------- 3 files changed, 134 insertions(+), 105 deletions(-) diff --git a/gp_learner.py b/gp_learner.py index 68c6c25..ae0895c 100644 --- a/gp_learner.py +++ b/gp_learner.py @@ -48,7 +48,7 @@ from gp_query import predict_query from gp_query import query_time_hard_exceeded from gp_query import query_time_soft_exceeded -from gp_query import variable_substitution_deep_narrow_mut_query +from gp_query import dnp_query from gp_query import variable_substitution_query from graph_pattern import canonicalize from graph_pattern import gen_random_var @@ -655,15 +655,12 @@ def mutate_fix_var( def _mutate_deep_narrow_path_helper( - sparql, - timeout, - gtp_scores, - child, - edge_var, - node_var, - gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N, - limit_res=config.MUTPB_DN_QUERY_LIMIT, - sample_n=config.MUTPB_FV_SAMPLE_MAXN, + sparql, timeout, gtp_scores, child, edge_var, node_var, + gtp_sample_n=config.MUTPB_FV_RGTP_SAMPLE_N, + max_node_count=config.MUTPB_DN_MAX_NODE_COUNT, + min_edge_count=config.MUTPB_DN_MIN_EDGE_COUNT, + limit=config.MUTPB_DN_QUERY_LIMIT, + sample_n=config.MUTPB_FV_SAMPLE_MAXN, ): assert isinstance(child, GraphPattern) assert isinstance(gtp_scores, GTPScores) @@ -675,10 +672,15 @@ def _mutate_deep_narrow_path_helper( gtp_sample_n = random.randint(1, gtp_sample_n) ground_truth_pairs = gtp_scores.remaining_gain_sample_gtps( - n=gtp_sample_n) - t, substitution_counts = variable_substitution_deep_narrow_mut_query( - sparql, timeout, child, edge_var, node_var, ground_truth_pairs, - limit_res) + max_n=gtp_sample_n) + t, substitution_counts = dnp_query( + sparql, timeout, child, ground_truth_pairs, + edge_var=edge_var, + node_var=node_var, + max_node_count=max_node_count, + min_edge_count=min_edge_count, + limit=limit, + ) edge_count, node_sum_count = substitution_counts if not node_sum_count: # the current pattern is unfit, as we can't find anything fulfilling it @@ -698,7 +700,7 @@ def _mutate_deep_narrow_path_helper( for 
edge, node_sum in node_sum_count.items(): ec = edge_count[edge] prio[edge] = ec / (node_sum / ec) # ec / AVG degree - # randomly pick n of the substitutions with a prob ~ to their counts + # randomly pick n of the substitutions with a prob ~ to their prios edges, prios = zip(*prio.most_common()) substs = sample_from_list(edges, prios, sample_n) @@ -715,9 +717,9 @@ def _mutate_deep_narrow_path_helper( fixed = True orig_child = child children = [ - GraphPattern(child, mapping={edge_var: subst}) - for subst in substs - ] + GraphPattern(child, mapping={edge_var: subst}) + for subst in substs + ] children = [ c if fit_to_live(c) else orig_child for c in children diff --git a/gp_query.py b/gp_query.py index 10f9003..fd44fd4 100644 --- a/gp_query.py +++ b/gp_query.py @@ -458,43 +458,55 @@ def _var_subst_res_update(res, update, **_): res += update -def variable_substitution_deep_narrow_mut_query( - sparql, timeout, graph_pattern, edge_var, node_var, - source_target_pairs, limit_res, batch_size=config.BATCH_SIZE): +def dnp_query( + sparql, timeout, graph_pattern, source_target_pairs, + edge_var, node_var, max_node_count, min_edge_count, limit, + batch_size=config.BATCH_SIZE +): _vars, _values, _ret_val_mapping = _get_vars_values_mapping( graph_pattern, source_target_pairs) - _edge_var_node_var_and_vars = (edge_var, node_var, _vars) return _multi_query( sparql, timeout, graph_pattern, source_target_pairs, batch_size, - _edge_var_node_var_and_vars, _values, _ret_val_mapping, - _var_subst_dnp_res_init, _var_subst_dnp_chunk_q, - _var_subst_dnp_chunk_result_ext, - _res_update=_var_subst_dnp_update, - limit=limit_res, + _vars, _values, _ret_val_mapping, + _dnp_res_init, _dnp_chunk_q, + _dnp_chunk_result_ext, + _res_update=_dnp_res_update, + edge_var=edge_var, + node_var=node_var, + max_node_count=max_node_count, + min_edge_count=min_edge_count, + limit=limit, # non standard, passed via **kwds, see handling below ) # noinspection PyUnusedLocal -def _var_subst_dnp_res_init(_, 
**kwds): +def _dnp_res_init(_, **kwds): return Counter(), Counter() -def _var_subst_dnp_chunk_q(gp, _edge_var_node_var_and_vars, - values_chunk, limit): - edge_var, node_var, _vars = _edge_var_node_var_and_vars - return gp.to_find_edge_var_for_narrow_path_query( +def _dnp_chunk_q( + gp, _vars, values_chunk, + edge_var, node_var, max_node_count, min_edge_count, limit, + **_ +): + return gp.to_deep_narrow_path_query( edge_var=edge_var, node_var=node_var, vars_=_vars, values={_vars: values_chunk}, - limit_res=limit) + max_node_count=max_node_count, + min_edge_count=min_edge_count, + limit=limit, + ) # noinspection PyUnusedLocal -def _var_subst_dnp_chunk_result_ext( - q_res, _edge_var_node_var_and_vars, _, **kwds): - edge_var, node_var, _vars = _edge_var_node_var_and_vars +def _dnp_chunk_result_ext( + q_res, _vars, _, + edge_var, + **kwds +): chunk_edge_count, chunk_node_sum = Counter(), Counter() res_rows_path = ['results', 'bindings'] bindings = sparql_json_result_bindings_to_rdflib( @@ -510,14 +522,12 @@ def _var_subst_dnp_chunk_result_ext( return chunk_edge_count, chunk_node_sum, -def _var_subst_dnp_update(res, up, **_): +def _dnp_res_update(res, up, **_): edge_count, node_sum_count = res - try: + if up: chunk_edge_count, chunk_node_sum = up edge_count.update(chunk_edge_count) node_sum_count.update(chunk_node_sum) - except ValueError: - pass def generate_stps_from_gp(sparql, gp): diff --git a/graph_pattern.py b/graph_pattern.py index 9990f1e..0c7833e 100644 --- a/graph_pattern.py +++ b/graph_pattern.py @@ -16,6 +16,7 @@ import logging import random import string +import textwrap import deap import deap.base @@ -31,7 +32,6 @@ import six from utils import URIShortener -import config logger = logging.getLogger(__name__) @@ -717,84 +717,101 @@ def to_count_var_over_values_query(self, var, vars_, values, limit): res += 'LIMIT %d\n' % limit return self._sparql_prefix(res) - def to_find_edge_var_for_narrow_path_query( - self, edge_var, node_var, vars_, values, limit_res, 
- filter_node_count=config.MUTPB_DN_FILTER_NODE_COUNT, - filter_edge_count=config.MUTPB_DN_FILTER_EDGE_COUNT, + def to_deep_narrow_path_query( + self, edge_var, node_var, vars_, values, + limit, max_node_count, min_edge_count, ): - """Counts possible substitutions for edge_var to get a narrow path + """Counts possible substitutions for edge_var to get a narrow path. Meant to perform a query like this: - SELECT * - { + PREFIX dbr: + SELECT * WHERE { { - SELECT - ?edge_var - (COUNT(*) AS ?edge_var_count) - (MAX(?node_var_count) AS ?max_node_count) - (COUNT(*)/AVG(?node_var_count) as ?prio_var) - { - SELECT DISTINCT - ?source ?target ?edge_var (COUNT(?node_var) AS ?node_var_count) - { - VALUES (?source ?target) { - (dbr:Adolescence dbr:Youth) - (dbr:Adult dbr:Child) - (dbr:Angel dbr:Heaven) - (dbr:Arithmetic dbr:Mathematics) - } - ?node_var ?edge_var ?source . - ?source dbo:wikiPageWikiLink ?target . - } + SELECT ?edge_var + (SUM(?node_var_count) AS ?node_var_sum) + (COUNT(?source && ?target) AS ?edge_var_count) + (MAX(?node_var_count) AS ?max_node_count) + WHERE { + SELECT DISTINCT ?source ?target ?edge_var + (COUNT(?node_var) AS ?node_var_count) + WHERE { + VALUES (?source ?target) { + (dbr:Barrel dbr:Wine) + (dbr:Barrister dbr:Law) + (dbr:Beak dbr:Bird) + (dbr:Blanket dbr:Bed) + } + ?node_var ?edge_var ?source . + ?source ?target . } - GROUP BY ?edge_var - ORDER BY DESC(?edge_var_count) + } + GROUP BY ?edge_var } - FILTER(?max_node_count < 10 && ?edge_var_count > 1) - } - ORDER BY DESC(?prio_var) - LIMIT 32 + FILTER(?max_node_count <= 10 + && ?edge_var_count >= 2) + } + ORDER BY DESC(?edge_var_count) ASC(?node_var_sum) + LIMIT 32 + + The idea here is to expand a random node (?source in the example above) + with new variable triple and then try to fix its edge in a way that the + degree (?node_var_count) isn't too high (<= max_node_count). We're also + interested in the avg degree being low. In light of query chunking the + sum is returned here (instead of AVG). 
+ + Apart from minimizing the degrees, we would also like to maximize the + number of stps an ?edge_var fixation is valid for (?edge_var_count). + + See gp_learner.mutate_deep_narrow_path() for more. :param edge_var: Edge variable to find substitution for. :param node_var: Node variable to count. :param vars_: List of vars to fix values for (e.g. ?source, ?target). :param values: List of value lists for vars_. - :param filter_node_count: Filter on node count of edge variable. - :param filter_edge_count: Filter for edge count of triples. - :param limit_res : limit result size + :param max_node_count: Filter on node count of edge variable. + :param min_edge_count: Filter for edge count of triples. + :param limit : limit result size. :return: Query String. """ - res = 'SELECT * WHERE {\n' - res += ' {\n'\ - ' SELECT %s (SUM (?node_var_count) AS %s) (COUNT(%s) AS %s) ' \ - '(MAX(?node_var_count) AS ?max_node_count) WHERE {\n' % ( - edge_var.n3(), - NODE_VAR_SUM.n3(), - ' && '.join([v.n3() for v in vars_]), - EDGE_VAR_COUNT.n3(), ) - res += ' SELECT DISTINCT %s %s (COUNT(%s) AS ?node_var_count) ' \ - 'WHERE {\n ' % (' '.join([v.n3() for v in vars_]), - edge_var.n3(), node_var.n3(), ) - res += self._sparql_values_part(values) - - # triples part - tres = [] - for s, p, o in self: - tres.append('%s %s %s .' 
% (s.n3(), p.n3(), o.n3())) - indent = ' ' * 3 - triples = indent + ('\n' + indent).join(tres) + '\n' - res += triples - res += ' }\n'\ - ' }\n' - res += ' GROUP BY %s\n' % edge_var.n3() - res += ' }\n' - res += ' FILTER(?max_node_count < %d && %s > %d)\n' \ - % (filter_node_count, EDGE_VAR_COUNT.n3(), - filter_edge_count) - res += '}\n' - res += 'ORDER BY ASC(%s)\n' % NODE_VAR_SUM.n3() - res += 'LIMIT %d' % limit_res + res = '''\ + SELECT * WHERE { + { + SELECT %(edge_var)s + (SUM(?node_var_count) AS %(node_var_sum)s) + (COUNT(%(vars_and)s) AS %(edge_var_count)s) + (MAX(?node_var_count) AS ?max_node_count) + WHERE { + SELECT DISTINCT %(vars)s %(edge_var)s + (COUNT(%(node_var)s) AS ?node_var_count) + WHERE {\n%(values_part)s %(triples)s + } + } + GROUP BY %(edge_var)s + } + FILTER(?max_node_count <= %(max_node_count)d + && %(edge_var_count)s >= %(min_edge_count)d) + } + ORDER BY DESC(%(edge_var_count)s) ASC(%(node_var_sum)s) + LIMIT %(limit)d + ''' % { + # TODO: adapt self._sparql_values_part for template use (indent) + 'edge_var': edge_var.n3(), + 'node_var_sum': NODE_VAR_SUM.n3(), + 'vars_and': ' && '.join([v.n3() for v in vars_]), + 'edge_var_count': EDGE_VAR_COUNT.n3(), + 'vars': ' '.join([v.n3() for v in vars_]), + 'node_var': node_var.n3(), + 'values_part': self._sparql_values_part( + values, indent=' '), + 'triples': '\n '.join( + '%s %s %s .' 
% (s.n3(), p.n3(), o.n3()) for s, p, o in self + ), + 'limit': limit, + 'max_node_count': max_node_count, + 'min_edge_count': min_edge_count, + } + res = textwrap.dedent(res) return self._sparql_prefix(res) def to_dict(self): From a20eca60438812ce7f0aaadfbfe61e6f3815ee0e Mon Sep 17 00:00:00 2001 From: Joern Hees Date: Fri, 17 Feb 2017 03:31:12 +0100 Subject: [PATCH 4/9] test_mutate_deep_narrow_path moved to online tests --- tests/test_gp_learner_online.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_gp_learner_online.py b/tests/test_gp_learner_online.py index 54c2039..85742f6 100644 --- a/tests/test_gp_learner_online.py +++ b/tests/test_gp_learner_online.py @@ -15,6 +15,7 @@ from config import SPARQL_ENDPOINT from gp_learner import evaluate +from gp_learner import mutate_deep_narrow_path from gp_learner import mutate_fix_var from gp_learner import update_individuals from gp_query import calibrate_query_timeout @@ -134,6 +135,17 @@ def test_mutate_fix_var(): assert gp.vars_in_graph - tgp.vars_in_graph +def test_mutate_deep_narrow_path(): + p = Variable('p') + gp = GraphPattern([ + (SOURCE_VAR, p, TARGET_VAR) + ]) + child = mutate_deep_narrow_path(gp, sparql, timeout, gtp_scores) + assert gp == child or len(child) > len(gp) + print(gp) + print(child) + + def test_timeout_pattern(): u = URIRef('http://dbpedia.org/resource/Template:Reflist') wpdisambig = URIRef('http://dbpedia.org/ontology/wikiPageDisambiguates') @@ -158,3 +170,7 @@ def test_timeout_pattern(): assert fitness.f_measure == 0 else: assert fitness.f_measure > 0 + + +if __name__ == '__main__': + test_mutate_deep_narrow_path() From 72fc46a527346b46a65814b7d8e2c2cc4b26bae7 Mon Sep 17 00:00:00 2001 From: Joern Hees Date: Fri, 17 Feb 2017 03:34:14 +0100 Subject: [PATCH 5/9] test_deep_narrow_paths_query now checks formed query string against doc string example --- tests/test_gp_learner_offline.py | 52 +++++++++++++++++--------------- 1 file changed, 27 insertions(+), 25 
deletions(-) diff --git a/tests/test_gp_learner_offline.py b/tests/test_gp_learner_offline.py index 057be67..3f948d7 100644 --- a/tests/test_gp_learner_offline.py +++ b/tests/test_gp_learner_offline.py @@ -4,6 +4,7 @@ from collections import Counter import logging import random +import textwrap import rdflib from rdflib import URIRef @@ -13,7 +14,6 @@ from gp_learner import mutate_increase_dist from gp_learner import mutate_merge_var from gp_learner import mutate_simplify_pattern -from gp_learner import mutate_deep_narrow_path from graph_pattern import GraphPattern from graph_pattern import SOURCE_VAR from graph_pattern import TARGET_VAR @@ -109,33 +109,36 @@ def test_mutate_merge_var(): assert False, "merge never reached one of the cases: %s" % cases -def test_mutate_deep_narrow_path(): - p = Variable('p') - gp = GraphPattern([ - (SOURCE_VAR, p, TARGET_VAR) - ]) - child = mutate_deep_narrow_path(gp) - assert gp == child or len(child) > len(gp) - print(gp) - print(child) - +def test_deep_narrow_path_query(): + node_var = Variable('node_var') + edge_var = Variable('edge_var') + gtps = [ + (dbp['Barrel'], dbp['Wine']), + (dbp['Barrister'], dbp['Law']), + (dbp['Beak'], dbp['Bird']), + (dbp['Blanket'], dbp['Bed']), + ] -def test_to_find_edge_var_for_narrow_path_query(): - node_var = Variable('node_variable') - edge_var = Variable('edge_variable') gp = GraphPattern([ (node_var, edge_var, SOURCE_VAR), (SOURCE_VAR, wikilink, TARGET_VAR) ]) - filter_node_count = 10 - filter_edge_count = 1 - limit_res = 32 - vars_ = {SOURCE_VAR,TARGET_VAR} - res = GraphPattern.to_find_edge_var_for_narrow_path_query(gp, edge_var, node_var, - vars_, filter_node_count, - filter_edge_count, limit_res) - print(gp) - print(res) + + vars_ = (SOURCE_VAR, TARGET_VAR) + res = gp.to_deep_narrow_path_query( + edge_var, node_var, vars_, {vars_: gtps}, + limit=32, + max_node_count=10, + min_edge_count=2, + ).strip() + doc = gp.to_deep_narrow_path_query.__doc__ + doc_str_example_query = "\n".join([ + l 
for l in doc.splitlines() + if l.startswith(' ') + ]) + doc_str_example_query = textwrap.dedent(doc_str_example_query) + assert res == doc_str_example_query, \ + "res:\n%s\n\ndoes not look like:\n\n%s" % (res, doc_str_example_query) def test_simplify_pattern(): @@ -303,5 +306,4 @@ def test_gtp_scores(): if __name__ == '__main__': - # test_mutate_deep_narrow_path() - test_to_find_edge_var_for_narrow_path_query() + test_deep_narrow_path_query() From 3e7b352e491b9dfbe65b5af0908a875446a64b21 Mon Sep 17 00:00:00 2001 From: Nandika Kalra Date: Mon, 27 Feb 2017 18:01:34 +0100 Subject: [PATCH 6/9] WIP: added backtracking to deep narrow path mutation --- config/defaults.py | 2 ++ gp_learner.py | 34 ++++++++++++++++++++++++++++++---- 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/config/defaults.py b/config/defaults.py index 2c91c34..1d483a8 100644 --- a/config/defaults.py +++ b/config/defaults.py @@ -85,6 +85,8 @@ MUTPB_DN_MAX_NODE_COUNT = 10 # edge fixations may have <= nodes MUTPB_DN_MIN_EDGE_COUNT = 2 # edges need to be valid for >= GTPs MUTPB_DN_QUERY_LIMIT = 32 # SPARQL query limit for top edge fixations +MUTPB_DN_LOOK_AHEAD_LIMIT = 2 +MUTPB_DN_RECURSION_LIMIT = 4 # for import in helpers and __init__ __all__ = [_v for _v in globals().keys() if _v.isupper()] diff --git a/gp_learner.py b/gp_learner.py index ae0895c..12da464 100644 --- a/gp_learner.py +++ b/gp_learner.py @@ -723,7 +723,7 @@ def _mutate_deep_narrow_path_helper( children = [ c if fit_to_live(c) else orig_child for c in children - ] + ] if children: child = random.choice(list(children)) return child, fixed @@ -731,16 +731,23 @@ def _mutate_deep_narrow_path_helper( def mutate_deep_narrow_path( child, sparql, timeout, gtp_scores, + _rec_depth=0, + start_node=None, min_len=config.MUTPB_DN_MIN_LEN, max_len=config.MUTPB_DN_MAX_LEN, term_pb=config.MUTPB_DN_TERM_PB, + recursion_look_ahead=config.MUTPB_DN_LOOK_AHEAD_LIMIT, + rec_limit=config.MUTPB_DN_RECURSION_LIMIT, ): assert isinstance(child, 
GraphPattern) nodes = list(child.nodes) - start_node = random.choice(nodes) - # target_nodes = set(nodes) - {start_node} + if start_node is None: + start_node = random.choice(nodes) + fixed_for_start_node = start_node + fixed_gp = child gp = child hop = 0 + false_fixed_count = 0 while True: if hop >= min_len and random.random() < term_pb: break @@ -748,10 +755,29 @@ def mutate_deep_narrow_path( break hop += 1 new_triple, var_node, var_edge = _mutate_expand_node_helper(start_node) + orig_gp = gp gp += [new_triple] gp, fixed = _mutate_deep_narrow_path_helper( sparql, timeout, gtp_scores, gp, var_edge, var_node) - start_node = var_node + if fixed: + fixed_for_start_node = start_node + fixed_gp = orig_gp + false_fixed_count = 0 + start_node = var_node + if not fixed: + false_fixed_count += 1 + if false_fixed_count > recursion_look_ahead: + _rec_depth += 1 + if _rec_depth > rec_limit: + return gp + start_node = fixed_for_start_node + gp = mutate_deep_narrow_path( + fixed_gp, sparql, timeout, gtp_scores, + _rec_depth, + start_node=start_node + ) + return gp + start_node = var_node return gp From 8595524a32f0a35e7c2ec9e8d1fb3e886e372430 Mon Sep 17 00:00:00 2001 From: Joern Hees Date: Fri, 3 Mar 2017 19:33:15 +0100 Subject: [PATCH 7/9] mutate helper tells what it fixed a var to and actually orders by prios --- gp_learner.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/gp_learner.py b/gp_learner.py index 12da464..a1fb813 100644 --- a/gp_learner.py +++ b/gp_learner.py @@ -704,10 +704,10 @@ def _mutate_deep_narrow_path_helper( edges, prios = zip(*prio.most_common()) substs = sample_from_list(edges, prios, sample_n) - logger.info( - 'fixed variable %s in %sto:\n %s\n<%d out of:\n%s\n', + 'fixed variable %s to %s in %s\n %s\n<%d out of:\n%s\n', edge_var.n3(), + substs[0] if substs else '', child, '\n '.join([subst.n3() for subst in substs]), sample_n, @@ -715,17 +715,15 @@ def _mutate_deep_narrow_path_helper( ' %.3f: %s' % (c, v.n3()) for v, c in 
prio.most_common()]),
     )
     fixed = True
-    orig_child = child
     children = [
         GraphPattern(child, mapping={edge_var: subst})
         for subst in substs
     ]
     children = [
-        c if fit_to_live(c) else orig_child
-        for c in children
+        c for c in children if fit_to_live(c)
     ]
     if children:
-        child = random.choice(list(children))
+        child = children[0]
     return child, fixed


From b479257488406e408248c1fb8d71b36561effbcd Mon Sep 17 00:00:00 2001
From: Joern Hees
Date: Fri, 3 Mar 2017 19:34:56 +0100
Subject: [PATCH 8/9] made the deep_narrow_paths mutation recursive with retry attempts

---
 config/defaults.py | 5 ++--
 gp_learner.py | 62 +++++++++++++++++++---------------------------
 2 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/config/defaults.py b/config/defaults.py
index 1d483a8..2fdfe07 100644
--- a/config/defaults.py
+++ b/config/defaults.py
@@ -81,12 +81,11 @@ MUTPB_DN = 0.05 # prob to try a deep and narrow paths mutation
 MUTPB_DN_MIN_LEN = 2 # minimum length of the deep and narrow paths
 MUTPB_DN_MAX_LEN = 10 # max of path length if not stopped by term_pb
-MUTPB_DN_TERM_PB = 0.3 # prob to terminate expansion each step > min_len
+MUTPB_DN_TERM_PB = 0.7 # prob to terminate expansion each step > min_len
 MUTPB_DN_MAX_NODE_COUNT = 10 # edge fixations may have <= nodes
 MUTPB_DN_MIN_EDGE_COUNT = 2 # edges need to be valid for >= GTPs
 MUTPB_DN_QUERY_LIMIT = 32 # SPARQL query limit for top edge fixations
-MUTPB_DN_LOOK_AHEAD_LIMIT = 2
-MUTPB_DN_RECURSION_LIMIT = 4
+MUTPB_DN_REC_RETRIES = 3 # retry attempts in each recursion, WARNING: EXP!
# for import in helpers and __init__ __all__ = [_v for _v in globals().keys() if _v.isupper()] diff --git a/gp_learner.py b/gp_learner.py index a1fb813..f3c18c5 100644 --- a/gp_learner.py +++ b/gp_learner.py @@ -734,49 +734,37 @@ def mutate_deep_narrow_path( min_len=config.MUTPB_DN_MIN_LEN, max_len=config.MUTPB_DN_MAX_LEN, term_pb=config.MUTPB_DN_TERM_PB, - recursion_look_ahead=config.MUTPB_DN_LOOK_AHEAD_LIMIT, - rec_limit=config.MUTPB_DN_RECURSION_LIMIT, + retries=config.MUTPB_DN_REC_RETRIES, ): assert isinstance(child, GraphPattern) - nodes = list(child.nodes) - if start_node is None: + assert min_len > 0 + if _rec_depth > max_len: + return None + if _rec_depth >= min_len and random.random() < term_pb: + return None + if not start_node: + nodes = list(child.nodes) start_node = random.choice(nodes) - fixed_for_start_node = start_node - fixed_gp = child + gp = child - hop = 0 - false_fixed_count = 0 - while True: - if hop >= min_len and random.random() < term_pb: - break - if hop >= max_len: - break - hop += 1 - new_triple, var_node, var_edge = _mutate_expand_node_helper(start_node) - orig_gp = gp - gp += [new_triple] - gp, fixed = _mutate_deep_narrow_path_helper( + new_triple, var_node, var_edge = _mutate_expand_node_helper(start_node) + gp += [new_triple] + for r in range(retries): + fixed_gp, fixed = _mutate_deep_narrow_path_helper( sparql, timeout, gtp_scores, gp, var_edge, var_node) + rec_gp = mutate_deep_narrow_path( + fixed_gp, sparql, timeout, gtp_scores, + _rec_depth+1, + start_node=var_node + ) + if rec_gp: + return rec_gp if fixed: - fixed_for_start_node = start_node - fixed_gp = orig_gp - false_fixed_count = 0 - start_node = var_node - if not fixed: - false_fixed_count += 1 - if false_fixed_count > recursion_look_ahead: - _rec_depth += 1 - if _rec_depth > rec_limit: - return gp - start_node = fixed_for_start_node - gp = mutate_deep_narrow_path( - fixed_gp, sparql, timeout, gtp_scores, - _rec_depth, - start_node=start_node - ) - return gp - start_node = 
var_node - return gp + if _rec_depth > min_len: + return fixed_gp + if _rec_depth == 0: + return child + return None def mutate_simplify_pattern(gp): From 95440838497a6482b6ce42f7a9b152b15aceba7c Mon Sep 17 00:00:00 2001 From: Nandika Kalra Date: Mon, 6 Mar 2017 19:18:40 +0100 Subject: [PATCH 9/9] deep_narrow_paths mutation now tries to connect expanded gp to target nodes --- gp_learner.py | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/gp_learner.py b/gp_learner.py index f3c18c5..d463cd0 100644 --- a/gp_learner.py +++ b/gp_learner.py @@ -731,9 +731,11 @@ def mutate_deep_narrow_path( child, sparql, timeout, gtp_scores, _rec_depth=0, start_node=None, + target_nodes = None, min_len=config.MUTPB_DN_MIN_LEN, max_len=config.MUTPB_DN_MAX_LEN, term_pb=config.MUTPB_DN_TERM_PB, + pb_en_out_link=config.MUTPB_EN_OUT_LINK, retries=config.MUTPB_DN_REC_RETRIES, ): assert isinstance(child, GraphPattern) @@ -742,9 +744,24 @@ def mutate_deep_narrow_path( return None if _rec_depth >= min_len and random.random() < term_pb: return None - if not start_node: - nodes = list(child.nodes) - start_node = random.choice(nodes) + if _rec_depth == 0: + nodes = child.nodes + if not start_node: + start_node = random.choice(list(nodes)) + target_nodes = list(nodes - {start_node}) + if _rec_depth >=min_len: + closed_gp = child + for node in target_nodes: + var_edge_to_target = gen_random_var() + if random.random() < pb_en_out_link: + new_triple = (start_node, var_edge_to_target, node) + else: + new_triple = (node, var_edge_to_target, start_node) + closed_gp += [new_triple] + closed_gp, fixed_edge_to_target = _mutate_deep_narrow_path_helper( + sparql, timeout, gtp_scores, closed_gp,var_edge_to_target, node) + if fixed_edge_to_target: + return closed_gp gp = child new_triple, var_node, var_edge = _mutate_expand_node_helper(start_node) @@ -755,7 +772,8 @@ def mutate_deep_narrow_path( rec_gp = mutate_deep_narrow_path( fixed_gp, sparql, timeout, gtp_scores, 
_rec_depth+1, - start_node=var_node + start_node=var_node, + target_nodes = target_nodes, ) if rec_gp: return rec_gp