Skip to content

Commit

Permalink
Enable unknown license detection by default
Browse files Browse the repository at this point in the history
Signed-off-by: Ayan Sinha Mahapatra <[email protected]>
  • Loading branch information
AyanSinhaMahapatra committed Oct 27, 2023
1 parent b1a3620 commit 9cca820
Show file tree
Hide file tree
Showing 14 changed files with 98 additions and 47 deletions.
3 changes: 2 additions & 1 deletion etc/scripts/licenses/synclic.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
import licensedcode
from licensedcode.cache import get_licenses_by_spdx_key
from licensedcode import models
from licensedcode.match import MATCH_HASH
from licensedcode.models import load_licenses
from licensedcode.models import License

Expand Down Expand Up @@ -253,7 +254,7 @@ def get_match(text):
len(matches) == 1
and rule.is_from_license
and len(rule_licenses) == 1
and match.matcher == "1-hash"
and match.matcher == MATCH_HASH
and match.score() == 100
and match.len() == query_len
)
Expand Down
81 changes: 67 additions & 14 deletions src/licensedcode/detection.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,11 @@
from licensedcode.cache import build_spdx_license_expression
from licensedcode.match import LicenseMatch
from licensedcode.match import set_matched_lines
from licensedcode.match import MATCH_UNKNOWN
from licensedcode.match import MATCH_UNDETECTED
from licensedcode.match import MATCH_HASH
from licensedcode.match import MATCH_AHO_EXACT
from licensedcode.match import MATCH_SPDX_ID
from licensedcode.models import UnDetectedRule
from licensedcode.models import compute_relevance
from licensedcode.models import Rule
Expand Down Expand Up @@ -69,7 +74,6 @@ def logger_debug(*args):
def logger_debug(*args):
    """
    Log all ``args`` joined with single spaces as one DEBUG message and
    return the result of the ``logger.debug`` call.
    Non-empty strings are logged as-is; any other value is logged as its
    repr().
    """
    parts = []
    for arg in args:
        # an empty string is falsy, so it is rendered with repr() as well
        if isinstance(arg, str) and arg:
            parts.append(arg)
        else:
            parts.append(repr(arg))
    return logger.debug(' '.join(parts))

MATCHER_UNDETECTED = '5-undetected'

# Matches with a match_coverage less than this value are not considered
# perfect detections
Expand Down Expand Up @@ -105,6 +109,7 @@ class DetectionCategory(Enum):
PACKAGE_ADD_FROM_FILE = 'from-package-file'
EXTRA_WORDS = 'extra-words'
UNKNOWN_MATCH = 'unknown-match'
UNKNOWN_NGRAMS_MATCH = 'unknown-ngrams-match'
LICENSE_CLUES = 'license-clues'
LOW_QUALITY_MATCH_FRAGMENTS = 'low-quality-matches'
IMPERFECT_COVERAGE = 'imperfect-match-coverage'
Expand Down Expand Up @@ -133,6 +138,7 @@ class DetectionRule(Enum):
CONTAINED_SAME_LICENSE = 'contained-with-same-license'
UNVERSIONED_FOLLOWED_BY_VERSIONED = 'un-versioned-followed-by-versioned'
UNDETECTED_LICENSE = 'undetected-license'
UNKNOWN_NGRAMS_MATCH = 'unknown-ngrams-match'
PACKAGE_UNKNOWN_REFERENCE_TO_LOCAL_FILE = 'package-unknown-reference-to-local-file'
PACKAGE_ADD_FROM_SIBLING_FILE = 'from-package-sibling-file'
PACKAGE_ADD_FROM_FILE = 'from-package-file'
Expand Down Expand Up @@ -961,10 +967,17 @@ def is_undetected_license_matches(license_matches):
if len(license_matches) != 1:
return False

if license_matches[0].matcher == MATCHER_UNDETECTED:
if license_matches[0].matcher == MATCH_UNDETECTED:
return True


def is_ngrams_unknown_license_matches(license_matches):
    """
    Return True if ``license_matches`` is a non-empty list of LicenseMatch
    objects whose ``matcher`` is MATCH_UNKNOWN for every match. Return False
    otherwise, including when ``license_matches`` is empty.
    """
    # all() of an empty iterable is True: without this guard, an empty list of
    # matches would wrongly be reported as unknown ngrams matches.
    if not license_matches:
        return False

    return all(
        license_match.matcher == MATCH_UNKNOWN
        for license_match in license_matches
    )


def is_correct_detection_non_unknown(license_matches):
"""
Return True if all the matches in ``license_matches`` List of LicenseMatch
Expand All @@ -988,7 +1001,7 @@ def is_correct_detection(license_matches):
]

return (
all(matcher in ("1-hash", "1-spdx-id", "2-aho") for matcher in matchers)
all(matcher in (MATCH_HASH, MATCH_SPDX_ID, MATCH_AHO_EXACT) for matcher in matchers)
and all(is_match_coverage_perfect)
)

Expand Down Expand Up @@ -1309,14 +1322,19 @@ def get_detected_license_expression(
)

matches_for_expression = None
combined_expression = None
detection_log = []

if analysis == DetectionCategory.FALSE_POSITVE.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionRule.FALSE_POSITIVE.value}')
detection_log.append(DetectionRule.FALSE_POSITIVE.value)
return detection_log, combined_expression
return detection_log, None

elif analysis == DetectionCategory.UNKNOWN_NGRAMS_MATCH.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.UNKNOWN_NGRAMS_MATCH.value}')
matches_for_expression = license_matches
detection_log.append(DetectionRule.UNKNOWN_NGRAMS_MATCH.value)

elif analysis == DetectionCategory.UNDETECTED_LICENSE.value:
if TRACE_ANALYSIS:
Expand Down Expand Up @@ -1377,15 +1395,15 @@ def get_detected_license_expression(
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
detection_log.append(DetectionRule.LICENSE_CLUES.value)
return detection_log, combined_expression
return detection_log, None

elif analysis == DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value:
if TRACE_ANALYSIS:
logger_debug(f'analysis {DetectionCategory.LICENSE_CLUES.value}')
# TODO: we are temporarily returning these as license clues, and not
# in detections but ideally we should return synthetic unknowns for these
detection_log.append(DetectionRule.LOW_QUALITY_MATCH_FRAGMENTS.value)
return detection_log, combined_expression
return detection_log, None

else:
if TRACE_ANALYSIS:
Expand Down Expand Up @@ -1454,7 +1472,7 @@ def get_undetected_matches(query_string):
ispan=ispan,
hispan=hispan,
query_run_start=match_start,
matcher=MATCHER_UNDETECTED,
matcher=MATCH_UNDETECTED,
query=query_run.query,
)

Expand Down Expand Up @@ -1510,7 +1528,10 @@ def get_ambiguous_license_detections_by_type(unique_license_detections):

elif is_undetected_license_matches(license_matches=detection.matches):
ambi_license_detections[DetectionCategory.UNDETECTED_LICENSE.value] = detection


elif is_ngrams_unknown_license_matches(license_matches=detection.matches):
ambi_license_detections[DetectionCategory.UNKNOWN_NGRAMS_MATCH.value] = detection

elif has_correct_license_clue_matches(license_matches=detection.matches):
ambi_license_detections[DetectionCategory.LICENSE_CLUES.value] = detection

Expand Down Expand Up @@ -1542,7 +1563,10 @@ def analyze_detection(license_matches, package_license=False):
if TRACE:
logger_debug(f'license_matches {license_matches}', f'package_license {package_license}')

if is_undetected_license_matches(license_matches=license_matches):
if is_ngrams_unknown_license_matches(license_matches=license_matches):
return DetectionCategory.UNKNOWN_NGRAMS_MATCH.value

elif is_undetected_license_matches(license_matches=license_matches):
return DetectionCategory.UNDETECTED_LICENSE.value

elif has_unknown_intro_before_detection(license_matches=license_matches):
Expand Down Expand Up @@ -1593,6 +1617,20 @@ def analyze_detection(license_matches, package_license=False):
return DetectionCategory.PERFECT_DETECTION.value


def has_low_quality_matches(license_matches):
    """
    Return True if at least one group of matches built from the
    ``license_matches`` list of LicenseMatch objects is analyzed as
    low quality match fragments. Return False otherwise.
    """
    low_quality = DetectionCategory.LOW_QUALITY_MATCH_FRAGMENTS.value
    # any() stops at the first low quality group, like the early return of a
    # plain loop would.
    return any(
        analyze_detection(license_matches=matches_group) == low_quality
        for matches_group in group_matches(license_matches=license_matches)
    )


def group_matches(license_matches, lines_threshold=LINES_THRESHOLD):
"""
Given a list of ``license_matches`` LicenseMatch objects, yield lists of
Expand Down Expand Up @@ -1746,7 +1784,6 @@ def detect_licenses(
analysis=None,
post_scan=False,
package_license=False,
unknown_licenses=False,
min_score=0,
deadline=sys.maxsize,
as_expression=False,
Expand Down Expand Up @@ -1781,12 +1818,28 @@ def detect_licenses(
min_score=min_score,
deadline=deadline,
as_expression=as_expression,
unknown_licenses=unknown_licenses,
unknown_licenses=False,
**kwargs,
)

if not license_matches:
return
# TODO: Instead of analysing all matches once more and then matching the
# whole query again with unknown license detection enabled, we should get
# query runs only for the low quality matches and then run the specific
# unknown license matching on those parts (the outcome would be the same,
# but with better performance)
if has_low_quality_matches(license_matches) or not license_matches:
unknown_license_matches = index.match(
location=location,
query_string=query_string,
min_score=min_score,
deadline=deadline,
unknown_licenses=True,
**kwargs,
)
if not unknown_license_matches:
return

license_matches = unknown_license_matches

if TRACE:
logger_debug(f"detection: detect_licenses: location: {location}: query_string: {query_string}")
Expand Down
12 changes: 11 additions & 1 deletion src/licensedcode/match.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,16 @@
TRACE_REPR_ALL_MATCHED_TEXTS = False


# All matchers: each constant is the string identifier of one matcher. These
# are the values compared against the ``matcher`` attribute of a match.
MATCH_SPDX_ID = '1-spdx-id'
MATCH_HASH = '1-hash'
MATCH_AHO_EXACT = '2-aho'
MATCH_SEQ = '3-seq'
MATCH_UNDETECTED = '5-undetected'
MATCH_AHO_FRAG = '5-aho-frag'
MATCH_UNKNOWN = '6-unknown'


# No-op stub: accepts any positional arguments and does nothing.
def logger_debug(*args): pass


Expand Down Expand Up @@ -2606,7 +2616,7 @@ def is_candidate_false_positive(
# only tags or refs,
(match.rule.is_license_reference or match.rule.is_license_tag or match.rule.is_license_intro)
# but not tags that are SPDX license identifiers
and not match.matcher == '1-spdx-id'
and not match.matcher == MATCH_SPDX_ID
# exact matches only
and match.coverage() == 100

Expand Down
6 changes: 2 additions & 4 deletions src/licensedcode/match_aho.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@

from licensedcode import SMALL_RULE
from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_AHO_EXACT
from licensedcode.match import MATCH_AHO_FRAG
from licensedcode.spans import Span

"""
Expand Down Expand Up @@ -75,10 +77,6 @@ def add_sequence(automaton, tids, rid, start=0, with_duplicates=False):
automaton.add_word(tokens, [value])


MATCH_AHO_EXACT = '2-aho'
MATCH_AHO_FRAG = '5-aho-frag'


def exact_match(idx, query_run, automaton, matcher=MATCH_AHO_EXACT, **kwargs):
"""
Return a list of exact LicenseMatch by matching the `query_run` against
Expand Down
3 changes: 1 addition & 2 deletions src/licensedcode/match_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@


from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_HASH
from licensedcode.spans import Span

"""
Expand All @@ -38,8 +39,6 @@ def logger_debug(*args):
def logger_debug(*args):
    """
    Do nothing: accept any positional ``args`` and discard them.
    """
    pass

MATCH_HASH = '1-hash'


def tokens_hash(tokens):
"""
Expand Down
3 changes: 1 addition & 2 deletions src/licensedcode/match_seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@


from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_SEQ
from licensedcode.spans import Span


Expand Down Expand Up @@ -44,8 +45,6 @@ def logger_debug(*args):
like approaches.
"""

MATCH_SEQ = '3-seq'


def match_sequence(idx, rule, query_run, high_postings, start_offset=0,
match_blocks=None, deadline=sys.maxsize):
Expand Down
3 changes: 1 addition & 2 deletions src/licensedcode/match_spdx_lid.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from license_expression import Licensing

from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_SPDX_ID
from licensedcode.models import SpdxRule
from licensedcode.spans import Span
from textcode.markup import is_markup_text
Expand Down Expand Up @@ -58,8 +59,6 @@ def logger_debug(*args):
def logger_debug(*args):
    """
    Emit a single DEBUG message made of all ``args`` separated by spaces and
    return the result of the ``logger.debug`` call.
    A non-empty string is used verbatim; every other value is converted with
    repr().
    """
    rendered = [
        # an empty string is falsy, so it falls back to repr() too
        arg if (isinstance(arg, str) and arg) else repr(arg)
        for arg in args
    ]
    message = ' '.join(rendered)
    return logger.debug(message)

MATCH_SPDX_ID = '1-spdx-id'


def spdx_id_match(idx, query_run, text, expression_symbols=None):
"""
Expand Down
2 changes: 1 addition & 1 deletion src/licensedcode/match_unknown.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from licensedcode.models import UnknownRule
from licensedcode.match import get_full_qspan_matched_text
from licensedcode.match import LicenseMatch
from licensedcode.match import MATCH_UNKNOWN
from licensedcode.spans import Span

"""
Expand Down Expand Up @@ -43,7 +44,6 @@ def logger_debug(*args):
def logger_debug(*args):
    """
    Do nothing: accept any positional ``args`` and discard them.
    """
    pass

MATCH_UNKNOWN = '6-unknown'

UNKNOWN_NGRAM_LENGTH = 6

Expand Down
9 changes: 0 additions & 9 deletions src/licensedcode/plugin_license.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,13 +128,6 @@ class LicenseScanner(ScanPlugin):
help_group=SCAN_OPTIONS_GROUP,
),

PluggableCommandLineOption(
('--unknown-licenses',),
is_flag=True,
required_options=['license'],
help='[EXPERIMENTAL] Detect unknown licenses. ',
help_group=SCAN_OPTIONS_GROUP,
)
]

def is_enabled(self, license, **kwargs): # NOQA
Expand All @@ -155,7 +148,6 @@ def get_scanner(
license_text_diagnostics=False,
license_diagnostics=False,
license_url_template=SCANCODE_LICENSEDB_URL,
unknown_licenses=False,
**kwargs
):

Expand All @@ -166,7 +158,6 @@ def get_scanner(
license_text_diagnostics=license_text_diagnostics,
license_diagnostics=license_diagnostics,
license_url_template=license_url_template,
unknown_licenses=unknown_licenses,
)

def process_codebase(self, codebase, license_diagnostics, **kwargs):
Expand Down
4 changes: 3 additions & 1 deletion src/licensedcode/tracing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@
from functools import partial
import textwrap

from licensedcode.match import MATCH_UNKNOWN

"""
Utility function to trace matched texts used for tracing and testing.
"""
Expand All @@ -27,7 +29,7 @@ def get_texts(match, width=80, margin=0):
"""
qtokens = match.matched_text(whole_lines=False).split()
mqt = format_text(tokens=qtokens, width=width, margin=margin)
if match.matcher == '6-unknown':
if match.matcher == MATCH_UNKNOWN:
itokens = match.rule.text.split()
else:
itokens = matched_rule_tokens_str(match)
Expand Down
Loading

0 comments on commit 9cca820

Please sign in to comment.