Always store texts in license and rules #3067
This way we do not store paths at all.
This requires a fairly significant code change:
- License/Rule data_file and text_file are now methods.
- License/Rule stored_text is gone; text is now a plain field, not a property.
- Loading and dumping now require a location (see the sketch after this list).
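
A minimal sketch of the resulting API, inferred from the call sites in the diffs below; the `rules_data_dir` keyword passed to `dump()` is an assumption mirroring the `licenses_data_dir` keyword visible in the synclic.py hunk:

```python
from licensedcode import models

# text is now a plain field set directly; stored_text is gone
rule = models.BasicRule(license_expression="mit")
rule.text = "Permission is hereby granted, free of charge..."

# data_file() and text_file() are now methods that compute a path on
# demand, so no path is stored on the rule itself
print(f"file://{rule.data_file()}")
print(f"file://{rule.text_file()}")

# dumping requires an explicit location; this keyword name is assumed
rule.dump(rules_data_dir="some/rules/dir")
```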

Signed-off-by: Philippe Ombredanne <[email protected]>
pombredanne committed Aug 26, 2022
1 parent 4f619e5 commit 2b4e561
Showing 34 changed files with 745 additions and 875 deletions.
18 changes: 5 additions & 13 deletions etc/scripts/licenses/buildrules.py
@@ -156,8 +156,8 @@ def all_rule_by_tokens():
     try:
         rule_tokens[tuple(rule.tokens())] = rule.identifier
     except Exception as e:
-        df = f" file://{rule.data_file}"
-        tf = f" file://{rule.text_file}"
+        df = f" file://{rule.data_file()}"
+        tf = f" file://{rule.text_file()}"
         raise Exception(
             f"Failed to to get tokens from rule:: {rule.identifier}\n" f"{df}\n{tf}"
         ) from e
@@ -211,7 +211,7 @@ def cli(licenses_file):
     rdata.data["has_stored_minimum_coverage"] = bool(minimum_coverage)

     rl = models.BasicRule(**rdata.data)
-    rl.stored_text = rdata.text
+    rl.text = rdata.text
     skinny_rules.append(rl)

 models.validate_rules(skinny_rules, licenses_by_key, with_text=True)
@@ -226,7 +226,7 @@ def cli(licenses_file):
     else:
         base_name = rule.license_expression

-    text = rule.text()
+    text = rule.text

     existing_rule = rule_exists(text)
     skinny_text = " ".join(text[:80].split()).replace("{", " ").replace("}", " ")
@@ -244,7 +244,7 @@ def cli(licenses_file):
     base_loc = find_rule_base_loc(base_name)

     rd = rule.to_dict()
-    rd["stored_text"] = rule.stored_text
+    rd["text"] = rule.text
     rd["has_stored_relevance"] = rule.has_stored_relevance
     rd["has_stored_minimum_coverage"] = rule.has_stored_minimum_coverage

@@ -253,9 +253,6 @@ def cli(licenses_file):
     # force recomputing relevance to remove junk stored relevance for long rules
     rulerec.set_relevance()

-    rulerec.data_file = base_loc + ".yml"
-    rulerec.text_file = base_loc + ".RULE"
-
     rule_tokens = tuple(rulerec.tokens())

     existing_rule = rule_by_tokens.get(rule_tokens)
@@ -264,11 +261,6 @@ def cli(licenses_file):
         continue
     else:
         print(f"Adding new rule: {base_name}")
-        print(" file://" + rulerec.data_file)
-        print(
-            " file://" + rulerec.text_file,
-        )
-        rulerec.dump()
         models.update_ignorables(rulerec, verbose=False)
         rulerec.dump()

6 changes: 3 additions & 3 deletions etc/scripts/licenses/report_license_rules.py
@@ -184,7 +184,7 @@ def cli(licenses, rules, category, license_key, with_text):
     if with_text:
         license_data["text"] = lic.text[:200]
     license_data["is_unknown"] = lic.is_unknown
-    license_data["words_count"] = len(lic.text)
+    license_data["length"] = len(lic.text)
     license_data["reference_url"] = SCANCODE_LICENSEDB_URL.format(lic.key)
     licenses_output.append(license_data)

@@ -210,9 +210,9 @@ def cli(licenses, rules, category, license_key, with_text):
     rule_data["identifier"] = rule.identifier
     rule_data["referenced_filenames"] = rule.referenced_filenames
     if with_text:
-        rule_data["text"] = rule.text()[:200]
+        rule_data["text"] = rule.text[:200]
     rule_data["has_unknown"] = rule.has_unknown
-    rule_data["words_count"] = len(rule.text())
+    rule_data["length"] = len(rule.text)
     try:
         rule_data["category"] = licenses_data[rule_data["license_expression"]].category
     except KeyError:
4 changes: 1 addition & 3 deletions etc/scripts/licenses/synclic.py
@@ -147,9 +147,7 @@ def get_licenses(
     start = time.time()

     try:
-        with io.open(lic.text_file, "w", encoding="utf-8") as tf:
-            tf.write(text)
-        lic.dump()
+        lic.dump(licenses_data_dir=self.original_dir)
         licenses.append(lic)
     except:
         if TRACE:
22 changes: 12 additions & 10 deletions src/licensedcode/index.py
@@ -49,6 +49,7 @@
 TRACE_APPROX = False
 TRACE_APPROX_CANDIDATES = False
 TRACE_APPROX_MATCHES = False
+TRACE_INDEXING = False or os.environ.get('SCANCODE_DEBUG_LICENSE_INDEX', False)
 TRACE_INDEXING_PERF = False
 TRACE_TOKEN_DOC_FREQ = False
 TRACE_SPDX_LID = False
@@ -63,6 +64,7 @@ def logger_debug(*args):
     or TRACE_APPROX
     or TRACE_APPROX_CANDIDATES
     or TRACE_APPROX_MATCHES
+    or TRACE_INDEXING
     or TRACE_INDEXING_PERF
     or TRACE_SPDX_LID
 ):
@@ -304,6 +306,10 @@ def _add_rules(
         dictionary[sts] = stid

     self.rules_by_rid = rules_by_rid = list(rules)
+    if TRACE_INDEXING:
+        for _rid, _rule in enumerate(rules_by_rid):
+            logger_debug('rules_by_rid:', _rid, _rule)
+
     # ensure that rules are sorted
     rules_by_rid.sort()
     len_rules = len(rules_by_rid)
@@ -560,16 +566,12 @@ def _add_rules(

     dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
     if dupe_rules:
-        dupe_rule_paths = [
-            '\n'.join(
-                sorted([
-                    ('file://' + rule.text_file)
-                    if rule.text_file
-                    else ('text: ' + rule.stored_text)
-                    for rule in rules])
-            )
-            for rules in dupe_rules
-        ]
+        dupe_rule_paths = []
+        for rules in dupe_rules:
+            drp = [rule.identifier for rule in rules]
+            drp.sort()
+            dupe_rule_paths.append('\n'.join(drp))
+
         msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
         raise DuplicateRuleError(msg)

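A side note on the TRACE_INDEXING line added above: the `False or os.environ.get(...)` idiom keeps tracing off by default while letting it be enabled per run through an environment variable, without editing source. Since os.environ.get() returns a string, any non-empty value turns the flag on. A standalone illustration of the idiom (not ScanCode code):

```python
import os

# off by default; enable with e.g. SCANCODE_DEBUG_LICENSE_INDEX=1
TRACE_INDEXING = False or os.environ.get('SCANCODE_DEBUG_LICENSE_INDEX', False)

if TRACE_INDEXING:
    print('license index tracing is enabled')
else:
    print('license index tracing is disabled')
```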
2 changes: 1 addition & 1 deletion src/licensedcode/match.py
@@ -1783,7 +1783,7 @@ def filter_invalid_matches_to_single_word_gibberish(
         highlight=False,
     ).strip()

-    rule_text = rule.text().strip()
+    rule_text = rule.prepare_text()

     if trace:
         logger_debug(
10 changes: 8 additions & 2 deletions src/licensedcode/match_aho.py
@@ -20,9 +20,9 @@
 """

 # Set to True to enable debug tracing
-TRACE = False
+TRACE = True
 TRACE_FRAG = False
-TRACE_DEEP = False
+TRACE_DEEP = True

 if TRACE or TRACE_FRAG:
     import logging
@@ -93,7 +93,13 @@ def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
     qbegin = query_run.start

     matched_positions = get_matched_positions(query_run.tokens, qbegin, automaton)
+    if TRACE:
+        matched_positions = list(matched_positions)
+        logger_debug(' ##exact_AHO: matched_positions', matched_positions)
     matched_spans = get_matched_spans(matched_positions, query_run.matchables)
+    if TRACE:
+        matched_spans = list(matched_spans)
+        logger_debug(' ##exact_AHO: matched_spans', matched_spans)

     len_legalese = idx.len_legalese
     rules_by_rid = idx.rules_by_rid
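One detail worth flagging in the exact_match() hunk above: get_matched_positions() and get_matched_spans() appear to return lazy iterators, so the trace branches materialize them with list() before logging; otherwise the debug output would exhaust the iterator before the next step could consume it. A generic illustration of the pattern (the names here are stand-ins, not ScanCode APIs):

```python
TRACE = True

def get_positions():
    # stand-in for a lazy matcher result
    yield from [(0, 3), (7, 9)]

matched_positions = get_positions()
if TRACE:
    # materialize so logging does not consume the one-shot iterator
    matched_positions = list(matched_positions)
    print('matched_positions:', matched_positions)

# downstream code still sees every position
spans = [(start, end + 1) for start, end in matched_positions]
print('spans:', spans)
```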
2 changes: 1 addition & 1 deletion src/licensedcode/match_spdx_lid.py
@@ -88,7 +88,7 @@ def spdx_id_match(idx, query_run, text, expression_symbols=None):
     # Alternatively we could use the expression string, padded with
     # spdx-license-identifier: this may be wrong too, if the line was
     # not padded originally with this tag
-    stored_text=text,
+    text=text,
     length=match_len,
 )

2 changes: 1 addition & 1 deletion src/licensedcode/match_unknown.py
@@ -198,7 +198,7 @@ def get_tokens(_toks):
     print('match_unknowns: text', text)

     # ... and use this in a synthetic UnknownRule
-    rule = UnknownRule(stored_text=text, length=match_len)
+    rule = UnknownRule(text=text, length=match_len)

     # finally craft a LicenseMatch and return
     len_legalese = idx.len_legalese