Always store texts in license and rules #3067
This way we do not store paths at all.
This requires a fairly significant code change:
- License/Rule data_file and text_file are now methods.
- License/Rule stored_text is gone; text is now a plain field, not a property.
- Loading and dumping now require a location (see the sketch after this list).
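
A minimal sketch of the resulting API, inferred from the call sites in the diffs below; the `rules_data_dir` keyword passed to `dump()` is an assumption mirroring the `licenses_data_dir` keyword visible in the synclic.py hunk:

```python
from licensedcode import models

# text is now a plain field set directly; stored_text is gone
rule = models.BasicRule(license_expression="mit")
rule.text = "Permission is hereby granted, free of charge..."

# data_file() and text_file() are now methods that compute a path on
# demand, so no path is stored on the rule itself
print(f"file://{rule.data_file()}")
print(f"file://{rule.text_file()}")

# dumping requires an explicit location; this keyword name is assumed
rule.dump(rules_data_dir="some/rules/dir")
```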

Signed-off-by: Philippe Ombredanne <[email protected]>
pombredanne committed Aug 26, 2022
1 parent 4f619e5 commit 2b4e561
Showing 34 changed files with 745 additions and 875 deletions.
18 changes: 5 additions & 13 deletions etc/scripts/licenses/buildrules.py
@@ -156,8 +156,8 @@ def all_rule_by_tokens():
     try:
         rule_tokens[tuple(rule.tokens())] = rule.identifier
     except Exception as e:
-        df = f" file://{rule.data_file}"
-        tf = f" file://{rule.text_file}"
+        df = f" file://{rule.data_file()}"
+        tf = f" file://{rule.text_file()}"
         raise Exception(
             f"Failed to to get tokens from rule:: {rule.identifier}\n" f"{df}\n{tf}"
         ) from e
@@ -211,7 +211,7 @@ def cli(licenses_file):
     rdata.data["has_stored_minimum_coverage"] = bool(minimum_coverage)

     rl = models.BasicRule(**rdata.data)
-    rl.stored_text = rdata.text
+    rl.text = rdata.text
     skinny_rules.append(rl)

 models.validate_rules(skinny_rules, licenses_by_key, with_text=True)
@@ -226,7 +226,7 @@ def cli(licenses_file):
     else:
         base_name = rule.license_expression

-    text = rule.text()
+    text = rule.text

     existing_rule = rule_exists(text)
     skinny_text = " ".join(text[:80].split()).replace("{", " ").replace("}", " ")
@@ -244,7 +244,7 @@ def cli(licenses_file):
     base_loc = find_rule_base_loc(base_name)

     rd = rule.to_dict()
-    rd["stored_text"] = rule.stored_text
+    rd["text"] = rule.text
     rd["has_stored_relevance"] = rule.has_stored_relevance
     rd["has_stored_minimum_coverage"] = rule.has_stored_minimum_coverage

@@ -253,9 +253,6 @@ def cli(licenses_file):
     # force recomputing relevance to remove junk stored relevance for long rules
     rulerec.set_relevance()

-    rulerec.data_file = base_loc + ".yml"
-    rulerec.text_file = base_loc + ".RULE"
-
     rule_tokens = tuple(rulerec.tokens())

     existing_rule = rule_by_tokens.get(rule_tokens)
@@ -264,11 +261,6 @@ def cli(licenses_file):
         continue
     else:
         print(f"Adding new rule: {base_name}")
-        print(" file://" + rulerec.data_file)
-        print(
-            " file://" + rulerec.text_file,
-        )
-        rulerec.dump()
         models.update_ignorables(rulerec, verbose=False)
         rulerec.dump()

6 changes: 3 additions & 3 deletions etc/scripts/licenses/report_license_rules.py
@@ -184,7 +184,7 @@ def cli(licenses, rules, category, license_key, with_text):
     if with_text:
         license_data["text"] = lic.text[:200]
     license_data["is_unknown"] = lic.is_unknown
-    license_data["words_count"] = len(lic.text)
+    license_data["length"] = len(lic.text)
     license_data["reference_url"] = SCANCODE_LICENSEDB_URL.format(lic.key)
     licenses_output.append(license_data)

@@ -210,9 +210,9 @@ def cli(licenses, rules, category, license_key, with_text):
     rule_data["identifier"] = rule.identifier
     rule_data["referenced_filenames"] = rule.referenced_filenames
     if with_text:
-        rule_data["text"] = rule.text()[:200]
+        rule_data["text"] = rule.text[:200]
     rule_data["has_unknown"] = rule.has_unknown
-    rule_data["words_count"] = len(rule.text())
+    rule_data["length"] = len(rule.text)
     try:
         rule_data["category"] = licenses_data[rule_data["license_expression"]].category
     except KeyError:
4 changes: 1 addition & 3 deletions etc/scripts/licenses/synclic.py
@@ -147,9 +147,7 @@ def get_licenses(
     start = time.time()

     try:
-        with io.open(lic.text_file, "w", encoding="utf-8") as tf:
-            tf.write(text)
-        lic.dump()
+        lic.dump(licenses_data_dir=self.original_dir)
         licenses.append(lic)
     except:
         if TRACE:
22 changes: 12 additions & 10 deletions src/licensedcode/index.py
@@ -49,6 +49,7 @@
 TRACE_APPROX = False
 TRACE_APPROX_CANDIDATES = False
 TRACE_APPROX_MATCHES = False
+TRACE_INDEXING = False or os.environ.get('SCANCODE_DEBUG_LICENSE_INDEX', False)
 TRACE_INDEXING_PERF = False
 TRACE_TOKEN_DOC_FREQ = False
 TRACE_SPDX_LID = False
@@ -63,6 +64,7 @@ def logger_debug(*args):
     or TRACE_APPROX
     or TRACE_APPROX_CANDIDATES
     or TRACE_APPROX_MATCHES
+    or TRACE_INDEXING
     or TRACE_INDEXING_PERF
     or TRACE_SPDX_LID
 ):
@@ -304,6 +306,10 @@ def _add_rules(
         dictionary[sts] = stid

     self.rules_by_rid = rules_by_rid = list(rules)
+    if TRACE_INDEXING:
+        for _rid, _rule in enumerate(rules_by_rid):
+            logger_debug('rules_by_rid:', _rid, _rule)
+
     # ensure that rules are sorted
     rules_by_rid.sort()
     len_rules = len(rules_by_rid)
@@ -560,16 +566,12 @@ def _add_rules(

     dupe_rules = [rules for rules in dupe_rules_by_hash.values() if len(rules) > 1]
     if dupe_rules:
-        dupe_rule_paths = [
-            '\n'.join(
-                sorted([
-                    ('file://' + rule.text_file)
-                    if rule.text_file
-                    else ('text: ' + rule.stored_text)
-                    for rule in rules])
-            )
-            for rules in dupe_rules
-        ]
+        dupe_rule_paths = []
+        for rules in dupe_rules:
+            drp = [rule.identifier for rule in rules]
+            drp.sort()
+            dupe_rule_paths.append('\n'.join(drp))
+
         msg = ('Duplicate rules: \n' + '\n\n'.join(dupe_rule_paths))
         raise DuplicateRuleError(msg)

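A side note on the TRACE_INDEXING line added above: the `False or os.environ.get(...)` idiom keeps tracing off by default while letting it be enabled per run through an environment variable, without editing source. Since os.environ.get() returns a string, any non-empty value turns the flag on. A standalone illustration of the idiom (not ScanCode code):

```python
import os

# off by default; enable with e.g. SCANCODE_DEBUG_LICENSE_INDEX=1
TRACE_INDEXING = False or os.environ.get('SCANCODE_DEBUG_LICENSE_INDEX', False)

if TRACE_INDEXING:
    print('license index tracing is enabled')
else:
    print('license index tracing is disabled')
```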
2 changes: 1 addition & 1 deletion src/licensedcode/match.py
@@ -1783,7 +1783,7 @@ def filter_invalid_matches_to_single_word_gibberish(
         highlight=False,
     ).strip()

-    rule_text = rule.text().strip()
+    rule_text = rule.prepare_text()

     if trace:
         logger_debug(
10 changes: 8 additions & 2 deletions src/licensedcode/match_aho.py
@@ -20,9 +20,9 @@
 """

 # Set to True to enable debug tracing
-TRACE = False
+TRACE = True
 TRACE_FRAG = False
-TRACE_DEEP = False
+TRACE_DEEP = True

 if TRACE or TRACE_FRAG:
     import logging
@@ -93,7 +93,13 @@ def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
     qbegin = query_run.start

     matched_positions = get_matched_positions(query_run.tokens, qbegin, automaton)
+    if TRACE:
+        matched_positions = list(matched_positions)
+        logger_debug(' ##exact_AHO: matched_positions', matched_positions)
     matched_spans = get_matched_spans(matched_positions, query_run.matchables)
+    if TRACE:
+        matched_spans = list(matched_spans)
+        logger_debug(' ##exact_AHO: matched_spans', matched_spans)

     len_legalese = idx.len_legalese
     rules_by_rid = idx.rules_by_rid
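One detail worth flagging in the exact_match() hunk above: get_matched_positions() and get_matched_spans() appear to return lazy iterators, so the trace branches materialize them with list() before logging; otherwise the debug output would exhaust the iterator before the next step could consume it. A generic illustration of the pattern (the names here are stand-ins, not ScanCode APIs):

```python
TRACE = True

def get_positions():
    # stand-in for a lazy matcher result
    yield from [(0, 3), (7, 9)]

matched_positions = get_positions()
if TRACE:
    # materialize so logging does not consume the one-shot iterator
    matched_positions = list(matched_positions)
    print('matched_positions:', matched_positions)

# downstream code still sees every position
spans = [(start, end + 1) for start, end in matched_positions]
print('spans:', spans)
```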
2 changes: 1 addition & 1 deletion src/licensedcode/match_spdx_lid.py
@@ -88,7 +88,7 @@ def spdx_id_match(idx, query_run, text, expression_symbols=None):
     # Alternatively we could use the expression string, padded with
     # spdx-license-identifier: this may be wrong too, if the line was
     # not padded originally with this tag
-    stored_text=text,
+    text=text,
     length=match_len,
 )

2 changes: 1 addition & 1 deletion src/licensedcode/match_unknown.py
@@ -198,7 +198,7 @@ def get_tokens(_toks):
     print('match_unknowns: text', text)

     # ... and use this in a synthetic UnknownRule
-    rule = UnknownRule(stored_text=text, length=match_len)
+    rule = UnknownRule(text=text, length=match_len)

     # finally craft a LicenseMatch and return
     len_legalese = idx.len_legalese