Skip to content

Commit

Permalink
Add extra heuristic for false-positives aboutcode-org#29
Browse files Browse the repository at this point in the history
Adds an extra heuristic for false positives detection, based on
line number and rule length, and their related tests.

Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Jan 27, 2021
1 parent 1174844 commit 208449c
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 10 deletions.
41 changes: 32 additions & 9 deletions src/results_analyze/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,11 @@
# (i.e. and therefore, different rule)
LINES_THRESHOLD = 4

# Threshold Values of start line and rule length for a match to likely be a false positive
# (more than the start_line threshold and less than the rule_length threshold)
FALSE_POSITIVE_START_LINE_THRESHOLD = 1000
FALSE_POSITIVE_RULE_LENGTH_THRESHOLD = 3

# Whether to Use the NLP BERT Models
USE_LICENSE_CASE_BERT_MODEL = False
USE_FALSE_POSITIVE_BERT_MODEL = False
Expand Down Expand Up @@ -107,14 +112,17 @@ class AnalysisResult:
"license references with unknown licenses detected i.e. fragments of "
"known license text"
),
"notice-false-positive": (
"A piece of code/text is incorrectly detected as a license notice"
),
# `tag` sub-cases
"tag-tag-coverage": "a part of a license tag is detected",
"tag-other-tag-structures": (
"a new/common structure of tags are detected with scope for being "
"handled differently"
),
"tag-false-positive": (
"A piece of code/text is incorrectly detected as a license"
"A piece of code/text is incorrectly detected as a license tag"
),
# `reference` sub-cases
"reference-lead-in-or-unknown-refs": (
Expand All @@ -125,6 +133,9 @@ class AnalysisResult:
"matched to an unknown rule as the license information is present in "
"another file, which is referred to in this matched piece of text"
),
"reference-false-positive": (
"A piece of code/text is incorrectly detected as a license reference"
),
}

def __init__(self):
Expand Down Expand Up @@ -264,9 +275,17 @@ def is_false_positive(license_matches):
:param license_matches: list
List of license matches in a file-region.
"""
match_rule_length_values = (
start_line_region = min(match["start_line"] for match in license_matches)
match_rule_length_values = [
match["matched_rule"]["rule_length"] for match in license_matches
)
]

if start_line_region > FALSE_POSITIVE_START_LINE_THRESHOLD and any(
match_rule_length_value < FALSE_POSITIVE_RULE_LENGTH_THRESHOLD
for match_rule_length_value in match_rule_length_values
):
return True

match_is_license_tag_flags = (
match["matched_rule"]["is_license_tag"] for match in license_matches
)
Expand Down Expand Up @@ -388,15 +407,17 @@ def get_license_text_sub_type(is_license_text, is_legal):
return "text-lic-text-fragments"


def get_license_notice_sub_type(license_matches):
def get_license_notice_sub_type(license_matches, analysis):

license_expression_connectors = ["AND", "OR", "WITH"]

match_rule_license_expressions = [
match["matched_rule"]["license_expression"] for match in license_matches
]

if all(
if analysis == "false-positive":
return "notice-false-positive"
elif all(
any(
license_expression_connector in license_expression
for license_expression_connector in license_expression_connectors
Expand All @@ -421,13 +442,15 @@ def get_license_tag_sub_type(analysis):
return "tag-tag-coverage"


def get_license_reference_sub_type(license_matches):
def get_license_reference_sub_type(license_matches, analysis):

match_rule_identifiers = [
match["matched_rule"]["identifier"] for match in license_matches
]

if any("lead" in identifier for identifier in match_rule_identifiers):
if analysis == "false-positive":
return "reference-false-positive"
elif any("lead" in identifier for identifier in match_rule_identifiers):
return "reference-lead-in-refs"
elif any("unknown" in identifier for identifier in match_rule_identifiers):
return "reference-has-unknown-match"
Expand All @@ -444,15 +467,15 @@ def get_error_rule_sub_type(
)
elif analysis_result.error_rule_type == "is_license_notice":
analysis_result.error_rule_sub_type = get_license_notice_sub_type(
license_matches
license_matches, analysis_result.analysis
)
elif analysis_result.error_rule_type == "is_license_tag":
analysis_result.error_rule_sub_type = get_license_tag_sub_type(
analysis_result.analysis
)
elif analysis_result.error_rule_type == "is_license_reference":
analysis_result.error_rule_sub_type = get_license_reference_sub_type(
license_matches
license_matches, analysis_result.analysis
)


Expand Down
35 changes: 35 additions & 0 deletions tests/data/analyzer/analyze_for_scan_errors_false_positive_1.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[
{
"key": "gpl-1.0-plus",
"score": 100.0,
"name": "GNU General Public License 1.0 or later",
"short_name": "GPL 1.0 or later",
"category": "Copyleft",
"is_exception": false,
"owner": "Free Software Foundation (FSF)",
"homepage_url": "http://www.gnu.org/licenses/old-licenses/gpl-1.0-standalone.html",
"text_url": "http://www.gnu.org/licenses/old-licenses/gpl-1.0-standalone.html",
"reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-1.0-plus",
"spdx_license_key": "GPL-1.0-or-later",
"spdx_url": "https://spdx.org/licenses/GPL-1.0-or-later",
"start_line": 8818,
"end_line": 8820,
"matched_rule": {
"identifier": "gpl_234.RULE",
"license_expression": "gpl-1.0-plus",
"licenses": [
"gpl-1.0-plus"
],
"is_license_text": false,
"is_license_notice": true,
"is_license_reference": false,
"is_license_tag": false,
"matcher": "2-aho",
"rule_length": 3,
"matched_length": 3,
"match_coverage": 100.0,
"rule_relevance": 100.0
},
"matched_text": "- target audience: general public\n\nLicense Changes"
}
]
35 changes: 35 additions & 0 deletions tests/data/analyzer/analyze_for_scan_errors_false_positive_2.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
[
{
"key": "gpl-1.0",
"score": 100.0,
"name": "GNU General Public License 1.0",
"short_name": "GPL 1.0",
"category": "Copyleft",
"is_exception": false,
"owner": "Free Software Foundation (FSF)",
"homepage_url": "http://www.gnu.org/licenses/gpl-1.0.html",
"text_url": "http://www.gnu.org/licenses/gpl-1.0.txt",
"reference_url": "https://enterprise.dejacode.com/urn/urn:dje:license:gpl-1.0",
"spdx_license_key": "GPL-1.0-only",
"spdx_url": "https://spdx.org/licenses/GPL-1.0-only",
"start_line": 1606,
"end_line": 1606,
"matched_rule": {
"identifier": "gpl-1.0_15.RULE",
"license_expression": "gpl-1.0",
"licenses": [
"gpl-1.0"
],
"is_license_text": false,
"is_license_notice": false,
"is_license_reference": false,
"is_license_tag": true,
"matcher": "2-aho",
"rule_length": 2,
"matched_length": 2,
"match_coverage": 100.0,
"rule_relevance": 100.0
},
"matched_text": " currentBB->xferPoints[currentBB->num_xfer_points].vr_gpl = -1;"
}
]
24 changes: 23 additions & 1 deletion tests/test_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,28 @@ def test_analyzer_get_analysis_for_region_case_incorrect_false_positives(

assert license_match_error_result.analysis == "false-positive"

def test_analyzer_get_analysis_for_region_case_false_positive_true_1(self):

test_file = self.get_test_loc("analyze_for_scan_errors_false_positive_1.json")
license_matches = DataIOJSON.load_json(test_file)

analysis_result = analyzer.AnalysisResult()

analyzer.get_analysis_for_region(license_matches, analysis_result)

assert analysis_result.analysis == "false-positive"

def test_analyzer_get_analysis_for_region_case_false_positive_true_2(self):

test_file = self.get_test_loc("analyze_for_scan_errors_false_positive_2.json")
license_matches = DataIOJSON.load_json(test_file)

analysis_result = analyzer.AnalysisResult()

analyzer.get_analysis_for_region(license_matches, analysis_result)

assert analysis_result.analysis == "false-positive"

def test_analyzer_is_license_case_mixed_text(self):
test_file = self.get_test_loc("analyzer_is_license_case_mix_text.json")
license_matches = DataIOJSON.load_json(test_file)
Expand Down Expand Up @@ -341,7 +363,7 @@ def test_get_license_notice_sub_type_case_notice_unknown(self):
matches = DataIOJSON.load_json(test_file)

result_notice_sub_type = analyzer.get_license_notice_sub_type(
license_matches=matches
license_matches=matches, analysis="is_license_notice"
)
assert "notice-has-unknown-match" == result_notice_sub_type

Expand Down

0 comments on commit 208449c

Please sign in to comment.