Skip to content

Commit

Permalink
Add extra heuristic for false-positives aboutcode-org#29
Browse files Browse the repository at this point in the history
Adds an extra heuristic for false positives detection, based on
line number and rule length, and their related tests. Fixes error
reporting issue and other keyerrors.

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Jan 28, 2021
1 parent 1174844 commit aff7cce
Show file tree
Hide file tree
Showing 5 changed files with 130 additions and 15 deletions.
47 changes: 35 additions & 12 deletions src/results_analyze/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
# (i.e. and therefore, different rule)
LINES_THRESHOLD = 4

# Threshold Values of start line and rule length for a match to likely be a false positive
# (more than the start_line threshold and less than the rule_length threshold)
FALSE_POSITIVE_START_LINE_THRESHOLD = 1000
FALSE_POSITIVE_RULE_LENGTH_THRESHOLD = 3

# Whether to Use the NLP BERT Models
USE_LICENSE_CASE_BERT_MODEL = False
USE_FALSE_POSITIVE_BERT_MODEL = False
Expand Down Expand Up @@ -67,7 +72,7 @@ class AnalysisResult:
"A license rule from the scancode rules matches completely with a part of "
"the text, but there's some extra words which aren't there in the rule"
),
"false-positives": "A piece of code/text is incorrectly detected as a license",
"false-positive": "A piece of code/text is incorrectly detected as a license",
}

ERROR_RULE_TYPE_CHOICES = {
Expand Down Expand Up @@ -98,7 +103,7 @@ class AnalysisResult:
),
"text-lic-text-fragments": "only parts of a larger license text is detected",
# `notice` sub-cases
"notice-and-or-except-notice": (
"notice-and-or-with-notice": (
"a notice which notifies multiple licenses, as exceptions, as a choice "
"between, or as together"
),
Expand All @@ -107,6 +112,9 @@ class AnalysisResult:
"license references with unknown licenses detected i.e. fragments of "
"known license text"
),
"notice-false-positive": (
"A piece of code/text is incorrectly detected as a license"
),
# `tag` sub-cases
"tag-tag-coverage": "a part of a license tag is detected",
"tag-other-tag-structures": (
Expand All @@ -125,6 +133,9 @@ class AnalysisResult:
"matched to an unknown rule as the license information is present in "
"another file, which is referred to in this matched piece of text"
),
"reference-false-positive": (
"A piece of code/text is incorrectly detected as a license"
),
}

def __init__(self):
Expand Down Expand Up @@ -264,9 +275,17 @@ def is_false_positive(license_matches):
:param license_matches: list
List of license matches in a file-region.
"""
match_rule_length_values = (
start_line_region = min(match["start_line"] for match in license_matches)
match_rule_length_values = [
match["matched_rule"]["rule_length"] for match in license_matches
)
]

if start_line_region > FALSE_POSITIVE_START_LINE_THRESHOLD and any(
match_rule_length_value <= FALSE_POSITIVE_RULE_LENGTH_THRESHOLD
for match_rule_length_value in match_rule_length_values
):
return True

match_is_license_tag_flags = (
match["matched_rule"]["is_license_tag"] for match in license_matches
)
Expand Down Expand Up @@ -388,22 +407,24 @@ def get_license_text_sub_type(is_license_text, is_legal):
return "text-lic-text-fragments"


def get_license_notice_sub_type(license_matches):
def get_license_notice_sub_type(license_matches, analysis):

license_expression_connectors = ["AND", "OR", "WITH"]

match_rule_license_expressions = [
match["matched_rule"]["license_expression"] for match in license_matches
]

if all(
if analysis == "false-positive":
return "notice-false-positive"
elif all(
any(
license_expression_connector in license_expression
for license_expression_connector in license_expression_connectors
)
for license_expression in match_rule_license_expressions
):
return "notice-and-or-except-notice"
return "notice-and-or-with-notice"
elif any(
"unknown" in license_expression
for license_expression in match_rule_license_expressions
Expand All @@ -421,13 +442,15 @@ def get_license_tag_sub_type(analysis):
return "tag-tag-coverage"


def get_license_reference_sub_type(license_matches):
def get_license_reference_sub_type(license_matches, analysis):

match_rule_identifiers = [
match["matched_rule"]["identifier"] for match in license_matches
]

if any("lead" in identifier for identifier in match_rule_identifiers):
if analysis == "false-positive":
return "reference-false-positive"
elif any("lead" in identifier for identifier in match_rule_identifiers):
return "reference-lead-in-refs"
elif any("unknown" in identifier for identifier in match_rule_identifiers):
return "reference-has-unknown-match"
Expand All @@ -444,15 +467,15 @@ def get_error_rule_sub_type(
)
elif analysis_result.error_rule_type == "is_license_notice":
analysis_result.error_rule_sub_type = get_license_notice_sub_type(
license_matches
license_matches, analysis_result.analysis
)
elif analysis_result.error_rule_type == "is_license_tag":
analysis_result.error_rule_sub_type = get_license_tag_sub_type(
analysis_result.analysis
)
elif analysis_result.error_rule_type == "is_license_reference":
analysis_result.error_rule_sub_type = get_license_reference_sub_type(
license_matches
license_matches, analysis_result.analysis
)


Expand Down Expand Up @@ -513,7 +536,7 @@ def get_license_match_from_region(group_of_license_matches, analysis_result):
[match] = group_of_license_matches
match = {key: match[key] for key in MATCH_ATTRIBUTES_TO_KEEP}
else:
if analysis_result.error_rule_sub_type == "notice-and-or-except-notice":
if analysis_result.error_rule_sub_type == "notice-and-or-with-notice":
match = group_of_license_matches
else:
match = consolidate_matches_in_one_region(group_of_license_matches)
Expand Down
2 changes: 1 addition & 1 deletion src/results_analyze/analyzer_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ def process_codebase(self, codebase, **kwargs):
resource.license_detection_analysis = [ar.to_dict() for ar in ars]
except Exception as e:
msg = f"Cannot analyze scan for license scan errors: {str(e)}"
resource.errors.append(msg)
resource.scan_errors.append(msg)
codebase.save_resource(resource)


Expand Down
35 changes: 35 additions & 0 deletions tests/data/analyzer/analyze_for_scan_errors_false_positive_1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[
{
"key": "gpl-1.0-plus",
"score": 100.0,
"name": "GNU General Public License 1.0 or later",
"short_name": "GPL 1.0 or later",
"category": "Copyleft",
"is_exception": false,
"owner": "Free Software Foundation (FSF)",
"homepage_url": "http://www.gnu.org/licenses/old-licenses/gpl-1.0-standalone.html",
"text_url": "http://www.gnu.org/licenses/old-licenses/gpl-1.0-standalone.html",
"reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-1.0-plus",
"spdx_license_key": "GPL-1.0-or-later",
"spdx_url": "https://spdx.org/licenses/GPL-1.0-or-later",
"start_line": 8818,
"end_line": 8820,
"matched_rule": {
"identifier": "gpl_234.RULE",
"license_expression": "gpl-1.0-plus",
"licenses": [
"gpl-1.0-plus"
],
"is_license_text": false,
"is_license_notice": true,
"is_license_reference": false,
"is_license_tag": false,
"matcher": "2-aho",
"rule_length": 3,
"matched_length": 3,
"match_coverage": 100.0,
"rule_relevance": 100.0
},
"matched_text": "- target audience: general public\n\nLicense Changes"
}
]
35 changes: 35 additions & 0 deletions tests/data/analyzer/analyze_for_scan_errors_false_positive_2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[
{
"key": "gpl-1.0",
"score": 100.0,
"name": "GNU General Public License 1.0",
"short_name": "GPL 1.0",
"category": "Copyleft",
"is_exception": false,
"owner": "Free Software Foundation (FSF)",
"homepage_url": "http://www.gnu.org/licenses/gpl-1.0.html",
"text_url": "http://www.gnu.org/licenses/gpl-1.0.txt",
"reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-1.0",
"spdx_license_key": "GPL-1.0-only",
"spdx_url": "https://spdx.org/licenses/GPL-1.0-only",
"start_line": 1606,
"end_line": 1606,
"matched_rule": {
"identifier": "gpl-1.0_15.RULE",
"license_expression": "gpl-1.0",
"licenses": [
"gpl-1.0"
],
"is_license_text": false,
"is_license_notice": false,
"is_license_reference": false,
"is_license_tag": true,
"matcher": "2-aho",
"rule_length": 2,
"matched_length": 2,
"match_coverage": 100.0,
"rule_relevance": 100.0
},
"matched_text": " currentBB->xferPoints[currentBB->num_xfer_points].vr_gpl = -1;"
}
]
26 changes: 24 additions & 2 deletions tests/test_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,28 @@ def test_analyzer_get_analysis_for_region_case_incorrect_false_positives(

assert license_match_error_result.analysis == "false-positive"

def test_analyzer_get_analysis_for_region_case_false_positive_true_1(self):

test_file = self.get_test_loc("analyze_for_scan_errors_false_positive_1.json")
license_matches = DataIOJSON.load_json(test_file)

analysis_result = analyzer.AnalysisResult()

analyzer.get_analysis_for_region(license_matches, analysis_result)

assert analysis_result.analysis == "false-positive"

def test_analyzer_get_analysis_for_region_case_false_positive_true_2(self):

test_file = self.get_test_loc("analyze_for_scan_errors_false_positive_2.json")
license_matches = DataIOJSON.load_json(test_file)

analysis_result = analyzer.AnalysisResult()

analyzer.get_analysis_for_region(license_matches, analysis_result)

assert analysis_result.analysis == "false-positive"

def test_analyzer_is_license_case_mixed_text(self):
test_file = self.get_test_loc("analyzer_is_license_case_mix_text.json")
license_matches = DataIOJSON.load_json(test_file)
Expand Down Expand Up @@ -322,7 +344,7 @@ def test_get_error_rule_sub_type_case_multiple_or(self):
license_matches=matches, analysis_result=analysis_result,
is_license_text=False, is_legal=False
)
assert "notice-and-or-except-notice" == analysis_result.error_rule_sub_type
assert "notice-and-or-with-notice" == analysis_result.error_rule_sub_type

def test_get_error_rule_sub_type_case_single_key(self):
test_file = self.get_test_loc("get_error_rule_sub_type_case_notice_single_key.json")
Expand All @@ -341,7 +363,7 @@ def test_get_license_notice_sub_type_case_notice_unknown(self):
matches = DataIOJSON.load_json(test_file)

result_notice_sub_type = analyzer.get_license_notice_sub_type(
license_matches=matches
license_matches=matches, analysis="is_license_notice"
)
assert "notice-has-unknown-match" == result_notice_sub_type

Expand Down

0 comments on commit aff7cce

Please sign in to comment.