diff --git a/etc/scripts/genlicspdx.py b/etc/scripts/genlicspdx.py new file mode 100644 index 00000000000..c4355844759 --- /dev/null +++ b/etc/scripts/genlicspdx.py @@ -0,0 +1,113 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 nexB Inc. and others. All rights reserved. +# http://nexb.com and https://github.com/nexB/scancode-toolkit/ +# The ScanCode software is licensed under the Apache License version 2.0. +# Data generated with ScanCode require an acknowledgment. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# When you publish or redistribute any data created with ScanCode or any ScanCode +# derivative work, you must accompany this data with the following acknowledgment: +# +# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# ScanCode is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode-toolkit/ for support and download. + +from __future__ import absolute_import +from __future__ import unicode_literals +from __future__ import print_function + +import os + +import click +click.disable_unicode_literals_warning = True + +from licensedcode.models import load_licenses +from scancode.cli import run_scan + + +""" +Generate an SPDX document for each license known in ScanCode that are not listed +at SPDX. 
+Run python genlicspdx.py -h for help. + +NOTE: this is rather inefficient as it is starting a new command line process +for each license, taking a few seconds each time. +Upcoming code to call a scan function instead will be more efficient. +""" + +FOSS_CATEGORIES = set([ + 'Copyleft', + 'Copyleft Limited', + 'Patent License', + 'Permissive', + 'Public Domain', +]) + + +@click.command() +@click.argument('license_dir', + type=click.Path(file_okay=False, exists=True, writable=True, + allow_dash=False, resolve_path=True), + metavar='DIR') +@click.option('-v', '--verbose', is_flag=True, default=False, help='Print execution messages.') +@click.help_option('-h', '--help') +def cli(license_dir, verbose): + """ + Create one SPDX tag-value document for each non-SPDX ScanCode license. + Store these in the DIR directory + """ + + base_kwargs = dict( + license=True, license_diag=True, license_text=True, info=True, + strip_root=True, quiet=True, return_results=False) + + licenses_by_key = load_licenses(with_deprecated=False) + + + for i, lic in enumerate(licenses_by_key.values()): + ld = lic.to_dict() + + if lic.spdx_license_key: + if verbose: + click.echo( + 'Skipping ScanCode: {key} that is an SPDX license: {spdx_license_key}'.format(**ld)) + continue + + if not lic.text_file or not os.path.exists(lic.text_file): + if verbose: + click.echo( + 'Skipping license without text: {key}'.format(**ld)) + continue + + if lic.category not in FOSS_CATEGORIES: + if verbose: + click.echo( + 'Skipping non FOSS license: {key}'.format(**ld)) + continue + + output = 'licenseref-scancode-{key}.spdx'.format(**ld) + output = os.path.join(license_dir, output) + + if verbose: + click.echo('Creating SPDX document for license: {key}'.format(**ld)) + click.echo('at: {output}'.format(**locals())) + + with open(output, 'wb') as ouput_file: + kwargs = dict(input=lic.text_file, spdx_tv=ouput_file) + kwargs.update(base_kwargs) + run_scan(**kwargs) + + +if __name__ == '__main__': + cli() diff --git 
a/etc/scripts/scancli.py b/etc/scripts/scancli.py new file mode 100644 index 00000000000..d5829a421fd --- /dev/null +++ b/etc/scripts/scancli.py @@ -0,0 +1,74 @@ +# +# Copyright (c) 2019 nexB Inc. and others. All rights reserved. +# http://nexb.com and https://github.com/nexB/scancode-toolkit/ +# The ScanCode software is licensed under the Apache License version 2.0. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import json +from os.path import abspath +from os.path import dirname +from os.path import join +from os.path import normpath + +import execnet + +import scanserv + +""" +This is a module designed to be called from Python 2 or 3 and is the client +side. See scanserv for the back server module that runs on Python 2 and runs +effectively scancode. +""" + + +def scan(locations, deserialize=False, scancode_root_dir=None): + """ + Scan the list of paths at `location` and return the results as an iterable + of JSON strings. If `deserialize` is True the iterable contains a python data + instead. + Each location is scanned independently. 
+ """ + if not scancode_root_dir: + scancode_root_dir = abspath(normpath(__file__)) + scancode_root_dir = dirname(dirname(dirname(scancode_root_dir))) + python2 = join(scancode_root_dir, 'bin', 'python') + spec = 'popen//python={python2}'.format(**locals()) + gateway = execnet.makegateway(spec) # NOQA + channel = gateway.remote_exec(scanserv) + + for location in locations: + # build a mapping of options to use for this scan + scan_kwargs = dict( + location=location, + license=True, + license_text=True, + license_diag=True, + copyright=True, + info=True, + processes=0, + ) + + channel.send(scan_kwargs) # execute func-call remotely + results = channel.receive() + if deserialize: + results = json.loads(results) + yield results + + +if __name__ == '__main__': + import sys # NOQA + args = sys.argv[1:] + for s in scan(args): + print(s) diff --git a/etc/scripts/scanserv.README b/etc/scripts/scanserv.README new file mode 100644 index 00000000000..1da44b34312 --- /dev/null +++ b/etc/scripts/scanserv.README @@ -0,0 +1,29 @@ +A simple proof of concept for Python3 remoting with execnet. + +See ticket #1400 for more. + +This is an example of how to call Scancode as a function from Python2 or Python3. +The benefits are that when the server process has loaded the license index, +and imported its modules there is no per-call import/loading penalty anymore. + +This is using execnet which is the multiprocessing library used by +py.test and therefore a rather stable and high quality engine. + +To test, do this:: + +1. checkout scancode and run ./configure in a first shell. This is for a plain +ScanCode using Python 2 that will be used as a "server". + +2. in another shell, create a virtualenv with Python 3 in another + location. Activate that venv, and `pip install simplejson execnet` + +3. Change dir to the install scancode-toolkit/etc/scripts where the scancli.py +and scanserv.py scripts are. 
Then run:: + + python3 scancli.py ../../NOTICE ../../setup.py + +This will effectively make remote function calls to the Python2 +scancode and gets the result in Python3 alright. It also allows to have +multiple calls that reuse the same process, hence amortizing any startup +costs. Here this will run two scans: one on NOTICE and another on setup.py. +It could have been directories too. diff --git a/etc/scripts/scanserv.py b/etc/scripts/scanserv.py new file mode 100644 index 00000000000..fa49bb73605 --- /dev/null +++ b/etc/scripts/scanserv.py @@ -0,0 +1,55 @@ +# +# Copyright (c) 2019 nexB Inc. and others. All rights reserved. +# http://nexb.com and https://github.com/nexB/scancode-toolkit/ +# The ScanCode software is licensed under the Apache License version 2.0. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +""" +Python2 "server side" of the scan server. In a given execnet session, this +process will hold a loaded license index and can be invoked multiple times +without the index load penalty on each call. +""" + + +def as_json(results, pretty=True): + """ + Return a JSON string from a `results` data structure. + """ + # this is used for its ability to handle iterables as arrays. 
+ import simplejson + + kwargs = dict(iterable_as_array=True, encoding='utf-8') + if pretty: + kwargs.update(dict(indent=2 * b' ')) + else: + kwargs.update(dict(separators=(b',', b':',))) + return simplejson.dumps(results, **kwargs) + b'\n' + + +def run_scan(location, **kwargs): + from scancode import cli + pretty = kwargs.pop('pretty', True) + return as_json(cli.run_scan(location, **kwargs), pretty=pretty) + + +if __name__ == '__channelexec__': + for kwargs in channel: # NOQA + # a mapping of kwargs or a location string + if isinstance(kwargs, (str, unicode)): + channel.send(run_scan(kwargs)) # NOQA + elif isinstance(kwargs, dict): + channel.send(run_scan(**kwargs)) # NOQA + else: + raise Exception('Unknown arguments type: ' + repr(kwargs)) diff --git a/src/commoncode/fileset.py b/src/commoncode/fileset.py index f01c6db5ae4..aad58783e57 100644 --- a/src/commoncode/fileset.py +++ b/src/commoncode/fileset.py @@ -28,21 +28,26 @@ import fnmatch import os -import logging from commoncode import fileutils from commoncode import paths from commoncode.system import on_linux -DEBUG = False -logger = logging.getLogger(__name__) -# import sys -# logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) -# logger.setLevel(logging.DEBUG) + +TRACE = False +if TRACE: + import logging + import sys + + logger = logging.getLogger(__name__) + logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) + logger.setLevel(logging.DEBUG) + POSIX_PATH_SEP = b'/' if on_linux else '/' EMPTY_STRING = b'' if on_linux else '' + """ Match files and directories paths based on inclusion and exclusion glob-style patterns. @@ -79,33 +84,44 @@ """ -def match(path, includes, excludes): +def is_included(path, includes=None, excludes=None): """ - Return a matching pattern value (e.g. a reason message) or False if `path` is matched or not. - If the `path` is empty, return False. + Return a True if `path` is included based on mapping of `includes` and + `excludes` glob patterns. 
If the `path` is empty, return False. Matching is done based on the set of `includes` and `excludes` patterns maps - of {fnmtch pattern -> value} where value can be a message string or some other - object. - The order of the includes and excludes items does not matter and if a map is - empty , it is not used for matching. + of {fnmatch pattern: message}. If `includes` are provided they are tested + first. The `excludes` are tested second if provided. + + The ordering of the includes and excludes items does not matter and if a map + is empty, it is not used for matching. """ - includes = includes or {} - excludes = excludes or {} if not path or not path.strip(): return False - included = get_matches(path, includes, all_matches=False) - excluded = get_matches(path, excludes, all_matches=False) - if DEBUG: - logger.debug('in_fileset: path: %(path)r included:%(included)r, ' - 'excluded:%(excluded)r .' % locals()) - if excluded: - return False - elif included: - return included - else: - return False + if not includes and not excludes: + return True + + includes = includes or {} + includes = {k: v for k, v in includes.items() if k} + excludes = excludes or {} + excludes = {k: v for k, v in excludes.items() if k} + + if includes: + included = get_matches(path, includes, all_matches=False) + if TRACE: + logger.debug('in_fileset: path: %(path)r included:%(included)r' % locals()) + if not included: + return False + + if excludes: + excluded = get_matches(path, excludes, all_matches=False) + if TRACE: + logger.debug('in_fileset: path: %(path)r excluded:%(excluded)r .' % locals()) + if excluded: + return False + + return True def get_matches(path, patterns, all_matches=False): @@ -122,13 +138,17 @@ def get_matches(path, patterns, all_matches=False): pathstripped = path.lstrip(POSIX_PATH_SEP) if not pathstripped: return False + segments = paths.split(pathstripped) - if DEBUG: + + if TRACE: logger.debug('_match: path: %(path)r patterns:%(patterns)r.' 
% locals()) + matches = [] if not isinstance(patterns, dict): assert isinstance(patterns, (list, tuple)), 'Invalid patterns: {}'.format(patterns) patterns = {p: p for p in patterns} + for pat, value in patterns.items(): if not pat or not pat.strip(): continue @@ -146,8 +166,9 @@ def get_matches(path, patterns, all_matches=False): matches.append(value) if not all_matches: break - if DEBUG: + if TRACE: logger.debug('_match: matches: %(matches)r' % locals()) + if not all_matches: if matches: return matches[0] @@ -183,6 +204,7 @@ def includes_excludes(patterns, message): excluded = {} if not patterns: return included, excluded + for pat in patterns: pat = pat.strip() if not pat or pat.startswith(POUND): diff --git a/src/commoncode/fileutils.py b/src/commoncode/fileutils.py index dd8c8494ba4..5debd750582 100644 --- a/src/commoncode/fileutils.py +++ b/src/commoncode/fileutils.py @@ -91,6 +91,8 @@ def logger_debug(*args): ALL_SEPS = POSIX_PATH_SEP + WIN_PATH_SEP EMPTY_STRING = b'' if on_linux else '' DOT = b'.' if on_linux else '.' +PATH_SEP = bytes(os.sep) if on_linux else unicode(os.sep) + """ File, paths and directory utility functions. 
diff --git a/src/commoncode/ignore.py b/src/commoncode/ignore.py index d04e4892342..58f9e8831f4 100644 --- a/src/commoncode/ignore.py +++ b/src/commoncode/ignore.py @@ -44,7 +44,7 @@ def is_ignored(location, ignores, unignores=None, skip_special=True): """ if skip_special and filetype.is_special(location): return True - return fileset.match(location, includes=ignores, excludes=unignores) + return not fileset.is_included(location, includes=unignores, excludes=ignores) def is_ignore_file(location): diff --git a/src/formattedcode/output_json.py b/src/formattedcode/output_json.py index 220dc3802f8..926ba86acca 100644 --- a/src/formattedcode/output_json.py +++ b/src/formattedcode/output_json.py @@ -76,8 +76,8 @@ def is_enabled(self, output_json, **kwargs): return output_json def process_codebase(self, codebase, output_json, **kwargs): - files = self.get_files(codebase, **kwargs) - write_json(codebase, files, output_file=output_json, pretty=False) + results = get_results(codebase, as_list=False, **kwargs) + write_json(results, output_file=output_json, pretty=False) @output_impl @@ -96,35 +96,47 @@ def is_enabled(self, output_json_pp, **kwargs): return output_json_pp def process_codebase(self, codebase, output_json_pp, **kwargs): - files = self.get_files(codebase, **kwargs) - write_json(codebase, files, output_file=output_json_pp, pretty=True, **kwargs) + results = get_results(codebase, as_list=False, **kwargs) + write_json(results, output_file=output_json_pp, pretty=True) -def write_json(codebase, files, output_file, - include_summary=False, include_score=False, - pretty=False, **kwargs): - # NOTE: we write as binary, not text +def write_json(results, output_file, pretty=False, **kwargs): + """ + Write `results` to the `output_file` opened file-like object. 
+ """ + # NOTE: we write as encoded, binary bytes, not as unicode, decoded text + kwargs = dict(iterable_as_array=True, encoding='utf-8') + if pretty: + kwargs.update(dict(indent=2 * b' ')) + else: + kwargs.update(dict(separators=(b',', b':',))) + output_file.write(simplejson.dumps(results, **kwargs)) + output_file.write(b'\n') + + +def get_results(codebase, as_list=False, **kwargs): + """ + Return an ordered mapping of scan results collected from a `codebase`. + if `as_list` consume the "files" iterator in a list sequence. + """ codebase.add_files_count_to_current_header() - scan = OrderedDict([(b'headers', codebase.get_headers()), ]) + results = OrderedDict([('headers', codebase.get_headers()), ]) # add codebase toplevel attributes such as summaries if codebase.attributes: - scan.update(codebase.attributes.to_dict()) + results.update(codebase.attributes.to_dict()) + + files = OutputPlugin.get_files(codebase, **kwargs) + if as_list: + files = list(files) + results['files'] = files if TRACE: - logger_debug('write_json: files') + logger_debug('get_results: files') files = list(files) from pprint import pformat logger_debug(pformat(files)) - scan[b'files'] = files - - kwargs = dict(iterable_as_array=True, encoding='utf-8') - if pretty: - kwargs.update(dict(indent=2 * b' ')) - else: - kwargs.update(dict(separators=(b',', b':',))) + return results - output_file.write(simplejson.dumps(scan, **kwargs)) - output_file.write(b'\n') diff --git a/src/formattedcode/output_jsonlines.py b/src/formattedcode/output_jsonlines.py index 8914e64ef4c..770988b667c 100644 --- a/src/formattedcode/output_jsonlines.py +++ b/src/formattedcode/output_jsonlines.py @@ -53,6 +53,7 @@ class JsonLinesOutput(OutputPlugin): def is_enabled(self, output_json_lines, **kwargs): return output_json_lines + # TODO: reuse the json output code and merge that in a single plugin def process_codebase(self, codebase, output_json_lines, **kwargs): #NOTE: we write as binary, not text files = 
self.get_files(codebase, **kwargs) diff --git a/src/plugincode/__init__.py b/src/plugincode/__init__.py index 7ce3ec12d5a..74e7bab3779 100644 --- a/src/plugincode/__init__.py +++ b/src/plugincode/__init__.py @@ -189,8 +189,9 @@ def load_plugins(cls): for stage, manager in cls.managers.items(): mgr_setup = manager.setup() if not mgr_setup: + from scancode import ScancodeError msg = 'Cannot load ScanCode plugins for stage: %(stage)s' % locals() - raise Exception(msg) + raise ScancodeError(msg) mplugin_classes, mplugin_options = mgr_setup plugin_classes.extend(mplugin_classes) plugin_options.extend(mplugin_options) @@ -202,7 +203,7 @@ def setup(self): all plugin classes). Load and validate available plugins for this PluginManager from its - assigned `entrypoint`. Raise an Exception if a plugin is not valid such + assigned `entrypoint`. Raise a ScancodeError if a plugin is not valid such that when it does not subcclass the manager `plugin_base_class`. Must be called once to setup the plugins of this manager. """ @@ -215,7 +216,7 @@ def setup(self): entrypoint = self.entrypoint try: self.manager.load_setuptools_entrypoints(entrypoint) - except ImportError, e: + except ImportError as e: raise e stage = self.stage @@ -227,7 +228,8 @@ def setup(self): if not issubclass(plugin_class, self.plugin_base_class): qname = '%(stage)s:%(name)s' % locals() plugin_base_class = self.plugin_base_class - raise Exception( + from scancode import ScancodeError #NOQA + raise ScancodeError( 'Invalid plugin: %(qname)r: %(plugin_class)r ' 'must extend %(plugin_base_class)r.' % locals()) @@ -236,7 +238,8 @@ def setup(self): qname = '%(stage)s:%(name)s' % locals() oname = option.name clin = CommandLineOption - raise Exception( + from scancode import ScancodeError #NOQA + raise ScancodeError( 'Invalid plugin: %(qname)r: option %(oname)r ' 'must extend %(clin)r.' 
% locals()) plugin_options.append(option) diff --git a/src/scancode/__init__.py b/src/scancode/__init__.py index 271c66ae7ec..0115e81fcc6 100644 --- a/src/scancode/__init__.py +++ b/src/scancode/__init__.py @@ -72,6 +72,15 @@ def logger_debug(*args): return logger.debug(' '.join(isinstance(a, (unicode, str)) and a or repr(a) for a in args)) + +class ScancodeError(Exception): + """Base exception for scancode errors""" + + +class ScancodeCliUsageError(ScancodeError, click.UsageError): + """Exception for command line usage errors""" + + # CLI help groups SCAN_GROUP = 'primary scans' SCAN_OPTIONS_GROUP = 'scan options' diff --git a/src/scancode/cli.py b/src/scancode/cli.py index abc80a72a30..4a4e088845e 100644 --- a/src/scancode/cli.py +++ b/src/scancode/cli.py @@ -37,6 +37,7 @@ from collections import OrderedDict from functools import partial from itertools import imap +import os import sys from time import time import traceback @@ -47,7 +48,9 @@ # import early from scancode_config import __version__ as scancode_version +from commoncode.fileutils import as_posixpath from commoncode.fileutils import PATH_TYPE +from commoncode.fileutils import POSIX_PATH_SEP from commoncode.timeutils import time2tstamp from plugincode import PluginManager @@ -59,6 +62,8 @@ from plugincode import output_filter from plugincode import output +from scancode import ScancodeError +from scancode import ScancodeCliUsageError from scancode import CORE_GROUP from scancode import DOC_GROUP from scancode import MISC_GROUP @@ -260,7 +265,7 @@ def print_options(ctx, param, value): @click.pass_context # ensure that the input path is bytes on Linux, unicode elsewhere -@click.argument('input', metavar=' ', +@click.argument('input', metavar=' ...', nargs=-1, type=click.Path(exists=True, readable=True, path_type=PATH_TYPE)) @click.option('--strip-root', @@ -382,6 +387,7 @@ def scancode(ctx, input, # NOQA max_in_memory, test_mode, keep_temp_files, + echo_func=echo_stderr, *args, **kwargs): """scan the file 
or directory for license, origin and packages and save results to FILE(s) using one or more output format option. @@ -449,6 +455,114 @@ def scancode(ctx, input, # NOQA through Click context machinery. """ + success = False + try: + # Validate CLI UI options dependencies and other CLI-specific inits + if TRACE_DEEP: + logger_debug('scancode: ctx.params:') + for co in sorted(ctx.params.items()): + logger_debug(' scancode: ctx.params:', co) + + validate_option_dependencies(ctx) + pretty_params = get_pretty_params(ctx, generic_paths=test_mode) + + # run proper + success, _results = run_scan( + input=input, + from_json=from_json, + strip_root=strip_root, full_root=full_root, + processes=processes, timeout=timeout, + quiet=quiet, verbose=verbose, + timing=timing, max_in_memory=max_in_memory, + test_mode=test_mode, + keep_temp_files=keep_temp_files, + pretty_params=pretty_params, + # results are saved to file, no need to get them back in a cli context + return_results=False, + echo_func=echo_stderr, + *args, **kwargs) + + except click.UsageError as e: + # this will exit + raise e + + except ScancodeError as se: + # TODO :consider raising a usage error? + echo_func(se.message, color='red') + ctx.exit(2) + + rc = 0 if success else 1 + ctx.exit(rc) + + +def run_scan( + input, # NOQA + from_json=None, + strip_root=False, + full_root=False, + max_in_memory=10000, + processes=1, + timeout=120, + quiet=True, + verbose=False, + echo_func=None, + timing=False, + keep_temp_files=False, + return_results=True, + test_mode=False, + pretty_params=None, + *args, **kwargs): + """ + Run a scan on `input` path (or a list of input paths) and return a tuple of + (success, results) where success is a boolean and results is a list of + "files" items using the same data structure as the "files" in the JSON scan + results but as native Python. Raise Exceptions (e.g. ScancodeError) on + error. See scancode() for arguments details. 
+ """ + + if not echo_func: + def echo_func(*args, **kwargs): pass + + if not isinstance(input, (list, tuple)): + # nothing else todo + assert isinstance(input, (bytes, unicode)) + + elif len(input) == 1: + # we received a single input path, so we treat this as a single path + input = input[0] # NOQA + else: + # we received a several input paths: we can handle this IFF they share + # a common root directory and none is an absolute path + + if any(os.path.isabs(p) for p in input): + msg = ('ERROR: invalid inputs: input paths must be relative and ' + 'share a common parent when using multiple inputs.') + raise ScancodeError(msg + '\n' + traceback.format_exc()) + + # find the common prefix directory (note that this is a pre string operation + # hence it may return non-existing paths + common_prefix = os.path.commonprefix(input) + + if not common_prefix: + # we have no common prefix, but all relative. therefore the + # parent/root is the current ddirectory + common_prefix = PATH_TYPE('.') + + elif not os.path.isdir(common_prefix): + msg = 'ERROR: invalid inputs: all input paths must share a common parent directory.' + raise ScancodeError(msg + '\n' + traceback.format_exc()) + + # and we craft a list of synthetic --include path pattern options from + # the input list of paths + included_paths = [as_posixpath(path).rstrip(POSIX_PATH_SEP) for path in input] + # FIXME: this is a hack as this "include" is from an external plugin!!!1 + include = list(kwargs.get('include', []) or []) + include.extend(included_paths) + kwargs['include'] = include + + # ... 
and use the common prefix as our new input + input = common_prefix # NOQA + # build mappings of all kwargs to pass down to plugins standard_kwargs = dict( input=input, @@ -466,6 +580,7 @@ def scancode(ctx, input, # NOQA kwargs.update(standard_kwargs) success = True + results = None codebase = None processing_start = time() @@ -473,27 +588,16 @@ def scancode(ctx, input, # NOQA if not quiet: if not processes: - echo_stderr('Disabling multi-processing for debugging.', fg='yellow') + echo_func('Disabling multi-processing for debugging.', fg='yellow') elif processes == -1: - echo_stderr('Disabling multi-processing ' - 'and multi-threading for debugging.', fg='yellow') + echo_func('Disabling multi-processing ' + 'and multi-threading for debugging.', fg='yellow') try: - - ######################################################################## - # Validate UI options deps - ######################################################################## - validate_option_dependencies(ctx) - ######################################################################## # Find and create known plugin instances and collect the enabled ######################################################################## - if TRACE_DEEP: - ctx_params = sorted(ctx.params.items()) - logger_debug('scancode: ctx.params:') - for co in ctx.params: - logger_debug(' scancode: ctx.params:', co) enabled_plugins_by_stage = OrderedDict() all_enabled_plugins_by_qname = {} @@ -506,16 +610,20 @@ def scancode(ctx, input, # NOQA name = plugin_cls.name qname = plugin_cls.qname() plugin = plugin_cls(**kwargs) - if plugin.is_enabled(**kwargs): + is_enabled = False + try: + is_enabled = plugin.is_enabled(**kwargs) + except TypeError as te: + if not 'takes exactly' in str(te): + raise te + if is_enabled: stage_plugins.append(plugin) all_enabled_plugins_by_qname[qname] = plugin else: non_enabled_plugins_by_qname[qname] = plugin except: msg = 'ERROR: failed to load plugin: %(qname)s:' % locals() - echo_stderr(msg, fg='red') - 
echo_stderr(traceback.format_exc()) - ctx.exit(2) + raise ScancodeError(msg + '\n' + traceback.format_exc()) # NOTE: these are list of plugin instances, not classes! pre_scan_plugins = enabled_plugins_by_stage[pre_scan.stage] @@ -526,12 +634,12 @@ def scancode(ctx, input, # NOQA if from_json and scanner_plugins: msg = ('Data loaded from JSON: no scan options can be selected.') - raise click.UsageError(msg) + raise ScancodeCliUsageError(msg) - if not output_plugins: + if not output_plugins and not return_results: msg = ('Missing output option(s): at least one output ' 'option is required to save scan results.') - raise click.UsageError(msg) + raise ScancodeCliUsageError(msg) ######################################################################## # Get required and enabled plugins instance so we can run their setup @@ -559,7 +667,7 @@ def scancode(ctx, input, # NOQA for qn, requestors in requestors_by_missing_qname.items(): rqs = ', '.join(sorted(requestors)) msg += ' Plugin: {qn} is required by plugins: {rqs}.\n'.format(**locals()) - raise Exception(msg) + raise ScancodeError(msg) if TRACE_DEEP: logger_debug('scancode: plugins_to_setup: from required:', plugins_to_setup) @@ -578,7 +686,7 @@ def scancode(ctx, input, # NOQA plugins_setup_start = time() if not quiet and not verbose: - echo_stderr('Setup plugins...', fg='green') + echo_func('Setup plugins...', fg='green') # TODO: add progress indicator for plugin in plugins_to_setup: @@ -586,15 +694,13 @@ def scancode(ctx, input, # NOQA stage = plugin.stage name = plugin.name if verbose: - echo_stderr(' Setup plugin: %(stage)s:%(name)s...' % locals(), + echo_func(' Setup plugin: %(stage)s:%(name)s...' 
% locals(), fg='green') try: plugin.setup(**kwargs) except: msg = 'ERROR: failed to setup plugin: %(stage)s:%(name)s:' % locals() - echo_stderr(msg, fg='red') - echo_stderr(traceback.format_exc()) - ctx.exit(2) + raise ScancodeError(msg + '\n' + traceback.format_exc()) timing_key = 'setup_%(stage)s:%(name)s' % locals() setup_timings[timing_key] = time() - plugin_setup_start @@ -621,9 +727,7 @@ def scancode(ctx, input, # NOQA except: msg = ('ERROR: failed to collect resource_attributes for plugin: ' '%(stage)s:%(name)s:' % locals()) - echo_stderr(msg, fg='red') - echo_stderr(traceback.format_exc()) - ctx.exit(2) + raise ScancodeError(msg + '\n' + traceback.format_exc()) resource_attributes = OrderedDict() for _, name, attribs in sorted(sortable_resource_attributes): @@ -659,9 +763,7 @@ def scancode(ctx, input, # NOQA except: msg = ('ERROR: failed to collect codebase_attributes for plugin: ' '%(stage)s:%(name)s:' % locals()) - echo_stderr(msg, fg='red') - echo_stderr(traceback.format_exc()) - ctx.exit(2) + raise ScancodeError(msg + '\n' + traceback.format_exc()) codebase_attributes = OrderedDict() for _, name, attribs in sorted(sortable_codebase_attributes): @@ -685,7 +787,7 @@ def scancode(ctx, input, # NOQA inventory_start = time() if not quiet: - echo_stderr('Collect file inventory...', fg='green') + echo_func('Collect file inventory...', fg='green') if from_json: codebase_class = VirtualCodebase @@ -708,9 +810,7 @@ def scancode(ctx, input, # NOQA ) except: msg = 'ERROR: failed to collect codebase at: %(input)r' % locals() - echo_stderr(msg, fg='red') - echo_stderr(traceback.format_exc()) - ctx.exit(2) + raise ScancodeError(msg + '\n' + traceback.format_exc()) # update headers cle = codebase.get_or_create_current_header() @@ -718,7 +818,7 @@ def scancode(ctx, input, # NOQA cle.tool_name = 'scancode-toolkit' cle.tool_version = scancode_version cle.notice = notice - cle.options = get_pretty_params(ctx, generic_paths=test_mode) + cle.options = pretty_params or {} # 
TODO: this is weird: may be the timings should NOT be stored on the # codebase, since they exist in abstract of it?? @@ -735,10 +835,10 @@ def scancode(ctx, input, # NOQA # TODO: add progress indicator pre_scan_success = run_codebase_plugins( - ctx, stage='pre-scan', plugins=pre_scan_plugins, codebase=codebase, + stage='pre-scan', plugins=pre_scan_plugins, codebase=codebase, stage_msg='Run %(stage)ss...', plugin_msg=' Run %(stage)s: %(name)s...', - quiet=quiet, verbose=verbose, kwargs=kwargs, + quiet=quiet, verbose=verbose, kwargs=kwargs, echo_func=echo_func, ) success = success and pre_scan_success @@ -747,9 +847,9 @@ def scancode(ctx, input, # NOQA ######################################################################## scan_success = run_scanners( - ctx, stage='scan', plugins=scanner_plugins, codebase=codebase, + stage='scan', plugins=scanner_plugins, codebase=codebase, processes=processes, timeout=timeout, timing=timeout, - quiet=quiet, verbose=verbose, kwargs=kwargs, + quiet=quiet, verbose=verbose, kwargs=kwargs, echo_func=echo_func, ) success = success and scan_success @@ -759,9 +859,9 @@ def scancode(ctx, input, # NOQA # TODO: add progress indicator post_scan_success = run_codebase_plugins( - ctx, stage='post-scan', plugins=post_scan_plugins, codebase=codebase, + stage='post-scan', plugins=post_scan_plugins, codebase=codebase, stage_msg='Run %(stage)ss...', plugin_msg=' Run %(stage)s: %(name)s...', - quiet=quiet, verbose=verbose, kwargs=kwargs, + quiet=quiet, verbose=verbose, kwargs=kwargs, echo_func=echo_func, ) success = success and post_scan_success @@ -771,9 +871,9 @@ def scancode(ctx, input, # NOQA # TODO: add progress indicator output_filter_success = run_codebase_plugins( - ctx, stage='output-filter', plugins=output_filter_plugins, codebase=codebase, + stage='output-filter', plugins=output_filter_plugins, codebase=codebase, stage_msg='Apply %(stage)ss...', plugin_msg=' Apply %(stage)s: %(name)s...', - quiet=quiet, verbose=verbose, kwargs=kwargs, + 
quiet=quiet, verbose=verbose, kwargs=kwargs, echo_func=echo_func, ) success = success and output_filter_success @@ -793,14 +893,17 @@ def scancode(ctx, input, # NOQA errors = collect_errors(codebase, verbose) cle.errors = errors - # TODO: add progress indicator - output_success = run_codebase_plugins( - ctx, stage='output', plugins=output_plugins, codebase=codebase, - stage_msg='Save scan results...', - plugin_msg=' Save scan results as: %(name)s...', - quiet=quiet, verbose=verbose, kwargs=kwargs, - ) - success = success and output_success + # when called from Python we can only get results back and not have + # any output plugin + if output_plugins: + # TODO: add progress indicator + output_success = run_codebase_plugins( + stage='output', plugins=output_plugins, codebase=codebase, + stage_msg='Save scan results...', + plugin_msg=' Save scan results as: %(name)s...', + quiet=quiet, verbose=verbose, kwargs=kwargs, echo_func=echo_func, + ) + success = success and output_success ######################################################################## # 9. display summary @@ -810,8 +913,17 @@ def scancode(ctx, input, # NOQA # TODO: compute summary for output plugins too?? if not quiet: scan_names = ', '.join(p.name for p in scanner_plugins) - echo_stderr('Scanning done.', fg='green' if success else 'red') - display_summary(codebase, scan_names, processes, errors=errors, verbose=verbose) + echo_func('Scanning done.', fg='green' if success else 'red') + display_summary(codebase, scan_names, processes, errors=errors, + verbose=verbose, echo_func=echo_func) + + ######################################################################## + # 10. 
optionally assemble results to return + ######################################################################## + if return_results: + # the structure is exactly the same as the JSON output + from formattedcode.output_json import get_results + results = get_results(codebase, as_list=True, **kwargs) finally: # remove temporary files @@ -819,24 +931,24 @@ def scancode(ctx, input, # NOQA if keep_temp_files: if not quiet: msg = 'Keeping temporary files in: "{}".'.format(scancode_temp_dir) - echo_stderr(msg, fg='green' if success else 'red') + echo_func(msg, fg='green' if success else 'red') else: if not quiet: - echo_stderr('Removing temporary files...', fg='green', nl=False) + echo_func('Removing temporary files...', fg='green', nl=False) from commoncode import fileutils fileutils.delete(scancode_temp_dir) if not quiet: - echo_stderr('done.', fg='green') + echo_func('done.', fg='green') - rc = 0 if success else 1 - ctx.exit(rc) + return success, results -def run_codebase_plugins(ctx, stage, plugins, codebase, +def run_codebase_plugins(stage, plugins, codebase, stage_msg='', plugin_msg='', - quiet=False, verbose=False, kwargs=None): + quiet=False, verbose=False, kwargs=None, + echo_func=echo_stderr): """ Run the list of `stage` `plugins` on `codebase`. 
Display errors and messages based on the `stage_msg`and `plugin_msg` strings @@ -849,7 +961,7 @@ def run_codebase_plugins(ctx, stage, plugins, codebase, stage_start = time() if verbose and plugins: - echo_stderr(stage_msg % locals(), fg='green') + echo_func(stage_msg % locals(), fg='green') success = True # TODO: add progress indicator @@ -858,7 +970,7 @@ def run_codebase_plugins(ctx, stage, plugins, codebase, plugin_start = time() if verbose: - echo_stderr(plugin_msg % locals(), fg='green') + echo_func(plugin_msg % locals(), fg='green') try: if TRACE_DEEP: @@ -871,9 +983,9 @@ def run_codebase_plugins(ctx, stage, plugins, codebase, except Exception as _e: msg = 'ERROR: failed to run %(stage)s plugin: %(name)s:' % locals() - echo_stderr(msg, fg='red') + echo_func(msg, fg='red') tb = traceback.format_exc() - echo_stderr(tb) + echo_func(tb) codebase.errors.append(msg + '\n' + tb) success = False @@ -884,9 +996,10 @@ def run_codebase_plugins(ctx, stage, plugins, codebase, return success -def run_scanners(ctx, stage, plugins, codebase, +def run_scanners(stage, plugins, codebase, processes, timeout, timing, - quiet=False, verbose=False, kwargs=None): + quiet=False, verbose=False, kwargs=None, + echo_func=echo_stderr): """ Run the list of `stage` ScanPlugin `plugins` on `codebase`. Use multiple `processes` and limit the runtime of a single scanner function @@ -916,7 +1029,7 @@ def run_scanners(ctx, stage, plugins, codebase, progress_manager = None if not quiet: - echo_stderr('Scan files for: %(scan_names)s ' + echo_func('Scan files for: %(scan_names)s ' 'with %(processes)d process(es)...' 
% locals()) item_show_func = partial(path_progress_message, verbose=verbose) progress_manager = partial(progressmanager, @@ -931,7 +1044,7 @@ def run_scanners(ctx, stage, plugins, codebase, # TODO: add progress indicator # run the process codebase of each scan plugin (most often a no-op) scan_process_codebase_success = run_codebase_plugins( - ctx, stage, plugins, codebase, + stage, plugins, codebase, stage_msg='Filter %(stage)ss...', plugin_msg=' Filter %(stage)s: %(name)s...', quiet=quiet, verbose=verbose, kwargs=kwargs, @@ -956,7 +1069,7 @@ def run_scanners(ctx, stage, plugins, codebase, def scan_codebase(codebase, scanners, processes=1, timeout=DEFAULT_TIMEOUT, - with_timing=False, progress_manager=None): + with_timing=False, progress_manager=None, echo_func=echo_stderr): """ Run the `scanners` Scanner objects on the `codebase` Codebase. Return True on success or False otherwise. @@ -1062,7 +1175,7 @@ def scan_codebase(codebase, scanners, processes=1, timeout=DEFAULT_TIMEOUT, except StopIteration: break except KeyboardInterrupt: - echo_stderr('\nAborted with Ctrl+C!', fg='red') + echo_func('\nAborted with Ctrl+C!', fg='red') success = False if pool: pool.terminate() @@ -1139,7 +1252,7 @@ def scan_resource(location_rid, scanners, timeout=DEFAULT_TIMEOUT, return location, rid, scan_errors, scan_time, results, timings -def display_summary(codebase, scan_names, processes, errors, verbose): +def display_summary(codebase, scan_names, processes, errors, verbose, echo_func=echo_stderr): """ Display a scan summary. 
""" @@ -1213,39 +1326,39 @@ def display_summary(codebase, scan_names, processes, errors, verbose): errors_count = len(errors) if errors: - echo_stderr('Some files failed to scan properly:', fg='red') + echo_func('Some files failed to scan properly:', fg='red') for error in errors: for me in error.splitlines(False): - echo_stderr(me , fg='red') + echo_func(me , fg='red') ###################################################################### - echo_stderr('Summary: %(scan_names)s with %(processes)d process(es)' % locals()) - echo_stderr('Errors count: %(errors_count)d' % locals()) - echo_stderr('Scan Speed: %(scan_file_speed).2f files/sec. %(scan_size_speed)s' % locals()) + echo_func('Summary: %(scan_names)s with %(processes)d process(es)' % locals()) + echo_func('Errors count: %(errors_count)d' % locals()) + echo_func('Scan Speed: %(scan_file_speed).2f files/sec. %(scan_size_speed)s' % locals()) if prescan_scan_time: - echo_stderr('Early Scanners Speed: %(prescan_scan_file_speed).2f ' + echo_func('Early Scanners Speed: %(prescan_scan_file_speed).2f ' 'files/sec. 
%(prescan_scan_size_speed)s' % locals()) - echo_stderr('Initial counts: %(initial_res_count)d resource(s): ' - '%(initial_files_count)d file(s) ' - 'and %(initial_dirs_count)d directorie(s) ' - '%(initial_size_count)s' % locals()) + echo_func('Initial counts: %(initial_res_count)d resource(s): ' + '%(initial_files_count)d file(s) ' + 'and %(initial_dirs_count)d directorie(s) ' + '%(initial_size_count)s' % locals()) - echo_stderr('Final counts: %(final_res_count)d resource(s): ' - '%(final_files_count)d file(s) ' - 'and %(final_dirs_count)d directorie(s) ' - '%(final_size_count)s' % locals()) + echo_func('Final counts: %(final_res_count)d resource(s): ' + '%(final_files_count)d file(s) ' + 'and %(final_dirs_count)d directorie(s) ' + '%(final_size_count)s' % locals()) - echo_stderr('Timings:') + echo_func('Timings:') cle = codebase.get_or_create_current_header().to_dict() - echo_stderr(' scan_start: {start_timestamp}'.format(**cle)) - echo_stderr(' scan_end: {end_timestamp}'.format(**cle)) + echo_func(' scan_start: {start_timestamp}'.format(**cle)) + echo_func(' scan_end: {end_timestamp}'.format(**cle)) for name, value, in codebase.timings.items(): if value > 0.1: - echo_stderr(' %(name)s: %(value).2fs' % locals()) + echo_func(' %(name)s: %(value).2fs' % locals()) # TODO: if timing was requested display top per-scan/per-file stats? 
diff --git a/src/scancode/plugin_ignore.py b/src/scancode/plugin_ignore.py index 3f5da101089..961f18721b4 100644 --- a/src/scancode/plugin_ignore.py +++ b/src/scancode/plugin_ignore.py @@ -27,17 +27,40 @@ from functools import partial -from commoncode.fileset import match +from commoncode.fileset import is_included from plugincode.pre_scan import PreScanPlugin from plugincode.pre_scan import pre_scan_impl from scancode import CommandLineOption from scancode import PRE_SCAN_GROUP +# Tracing flags +TRACE = False + + +def logger_debug(*args): + pass + + +if TRACE: + import logging + import sys + + logger = logging.getLogger(__name__) + # logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) + logging.basicConfig(stream=sys.stdout) + logger.setLevel(logging.DEBUG) + + def logger_debug(*args): + return logger.debug( + ' '.join(isinstance(a, unicode) and a or repr(a) for a in args)) + + + @pre_scan_impl class ProcessIgnore(PreScanPlugin): """ - Ignore files matching the supplied pattern. + Include or ignore files matching patterns. """ options = [ @@ -46,48 +69,69 @@ class ProcessIgnore(PreScanPlugin): metavar='', help='Ignore files matching .', sort_order=10, + help_group=PRE_SCAN_GROUP), + CommandLineOption(('--include',), + multiple=True, + metavar='', + help='Include files matching .', + sort_order=11, help_group=PRE_SCAN_GROUP) ] - def is_enabled(self, ignore, **kwargs): - return ignore + def is_enabled(self, ignore, include, **kwargs): + return ignore or include - def process_codebase(self, codebase, ignore=(), **kwargs): + def process_codebase(self, codebase, ignore=(), include=(), **kwargs): """ - Remove ignored Resources from the resource tree. + Keep only included and non-ignored Resources in the codebase. 
""" - if not ignore: + if not (ignore or include): return - ignores = { + excludes = { pattern: 'User ignore: Supplied by --ignore' for pattern in ignore } - ignorable = partial(is_ignored, ignores=ignores) - rids_to_remove = [] - remove_resource = codebase.remove_resource + includes = { + pattern: 'User include: Supplied by --include' for pattern in include + } + + included = partial(is_included, includes=includes, excludes=excludes) + + rids_to_remove = set() + rids_to_remove_add = rids_to_remove.add + rids_to_remove_discard = rids_to_remove.discard # First, walk the codebase from the top-down and collect the rids of # Resources that can be removed. for resource in codebase.walk(topdown=True): - if ignorable(resource.path): + if resource.is_root: + continue + resource_rid = resource.rid + + if not included(resource.path): for child in resource.children(codebase): - rids_to_remove.append(child.rid) - rids_to_remove.append(resource.rid) + rids_to_remove_add(child.rid) + rids_to_remove_add(resource_rid) + else: + # we may have been selected for removal based on a parent dir + # but may be explicitly included. Honor that + rids_to_remove_discard(resource_rid) + if TRACE: + logger_debug('process_codebase: rids_to_remove') + logger_debug(rids_to_remove) + for rid in sorted(rids_to_remove): + logger_debug(codebase.get_resource(rid)) - # Then, walk bottom-up and remove the ignored Resources from the + remove_resource = codebase.remove_resource + + # Then, walk bottom-up and remove the non-included Resources from the # Codebase if the Resource's rid is in our list of rid's to remove. for resource in codebase.walk(topdown=False): resource_rid = resource.rid + if resource.is_root: + continue if resource_rid in rids_to_remove: - rids_to_remove.remove(resource_rid) + rids_to_remove_discard(resource_rid) remove_resource(resource) - - -def is_ignored(location, ignores): - """ - Return a tuple of (pattern , message) if a file at location is ignored or - False otherwise. 
`ignores` is a mappings of patterns to a reason. - """ - return match(location, includes=ignores, excludes={}) diff --git a/tests/commoncode/test_fileset.py b/tests/commoncode/test_fileset.py index e776b29fa6a..b20b68a0c99 100644 --- a/tests/commoncode/test_fileset.py +++ b/tests/commoncode/test_fileset.py @@ -39,65 +39,65 @@ def test_load(self): result = fileset.load(irf) assert ['/foo/*', '!/foobar/*', 'bar/*', '#comment'] == result - def test_match_basic(self): - assert not fileset.match('/common/src/', {}, {}) - assert not fileset.match('/common/src/', None, None) - assert not fileset.match(None, None, None) + def test_is_included_basic(self): + assert fileset.is_included('/common/src/', {}, {}) + assert fileset.is_included('/common/src/', None, None) + assert not fileset.is_included(None, None, None) - def test_in_fileset(self): + def test_is_included_in_fileset(self): incs = {'/common/src/*': '.scanignore'} excs = {'/common/src/*.so':'.scanignore'} - assert not fileset.match(None, incs, excs) - assert not fileset.match('', incs, excs) - assert not fileset.match('/', incs, excs) - assert fileset.match('/common/src/', incs, excs) - assert not fileset.match('/common/bin/', incs, excs) + assert not fileset.is_included(None, incs, excs) + assert not fileset.is_included('', incs, excs) + assert not fileset.is_included('/', incs, excs) + assert fileset.is_included('/common/src/', incs, excs) + assert not fileset.is_included('/common/bin/', incs, excs) - def test_in_fileset_2(self): + def test_is_included_in_fileset_2(self): incs = {'src*': '.scanignore'} excs = {'src/ab': '.scanignore'} - assert not fileset.match(None, incs, excs) - assert not fileset.match('', incs, excs) - assert not fileset.match('/', incs, excs) - assert fileset.match('/common/src/', incs, excs) - assert not fileset.match('src/ab', incs, excs) - assert fileset.match('src/abbab', incs, excs) - - def test_match_exclusions(self): + assert not fileset.is_included(None, incs, excs) + assert not 
fileset.is_included('', incs, excs) + assert not fileset.is_included('/', incs, excs) + assert fileset.is_included('/common/src/', incs, excs) + assert not fileset.is_included('src/ab', incs, excs) + assert fileset.is_included('src/abbab', incs, excs) + + def test_is_included_is_included_exclusions(self): incs = {'/src/*': '.scanignore'} excs = {'/src/*.so':'.scanignore'} - assert not fileset.match('/src/dist/build/mylib.so', incs, excs) + assert not fileset.is_included('/src/dist/build/mylib.so', incs, excs) - def test_match_exclusions_2(self): + def test_is_included_is_included_exclusions_2(self): incs = {'src': '.scanignore'} excs = {'src/*.so':'.scanignore'} - assert fileset.match('/some/src/this/that', incs, excs) - assert not fileset.match('/src/dist/build/mylib.so', incs, excs) + assert fileset.is_included('/some/src/this/that', incs, excs) + assert not fileset.is_included('/src/dist/build/mylib.so', incs, excs) - def test_match_empty_exclusions(self): + def test_is_included_empty_exclusions(self): incs = {'/src/*': '.scanignore'} excs = {'': '.scanignore'} - assert fileset.match('/src/dist/build/mylib.so', incs, excs) + assert fileset.is_included('/src/dist/build/mylib.so', incs, excs) - def test_match_sources(self): + def test_is_included_sources(self): incs = {'/home/elf/elf-0.5/*': '.scanignore'} excs = {'/home/elf/elf-0.5/src/elf': '.scanignore', '/home/elf/elf-0.5/src/elf.o': '.scanignore'} - assert not fileset.match('/home/elf/elf-0.5/src/elf', incs, excs) + assert not fileset.is_included('/home/elf/elf-0.5/src/elf', incs, excs) - def test_match_dot_svn(self): + def test_is_included_dot_svn(self): incs = {'*/.svn/*': '.scanignore'} excs = {} - assert fileset.match('home/common/tools/elf/.svn/', incs, excs) - assert fileset.match('home/common/tools/.svn/this', incs, excs) - assert not fileset.match('home/common/tools/this', incs, excs) + assert fileset.is_included('home/common/tools/elf/.svn/', incs, excs) + assert 
fileset.is_included('home/common/tools/.svn/this', incs, excs) + assert not fileset.is_included('home/common/tools/this', incs, excs) - def test_match_dot_svn_with_excludes(self): + def test_is_included_dot_svn_with_excludes(self): incs = {'*/.svn/*': '.scanignore'} excs = {'*/.git/*': '.scanignore'} - assert fileset.match('home/common/tools/elf/.svn/', incs, excs) - assert fileset.match('home/common/tools/.svn/this', incs, excs) - assert not fileset.match('home/common/.git/this', incs, excs) + assert fileset.is_included('home/common/tools/elf/.svn/', incs, excs) + assert fileset.is_included('home/common/tools/.svn/this', incs, excs) + assert not fileset.is_included('home/common/.git/this', incs, excs) def test_get_matches(self): patterns = {'*/.svn/*': '.scanignore'} diff --git a/tests/commoncode/test_ignore.py b/tests/commoncode/test_ignore.py index 0ec21989397..e8322610621 100644 --- a/tests/commoncode/test_ignore.py +++ b/tests/commoncode/test_ignore.py @@ -43,64 +43,56 @@ def test_is_ignored_default_ignores_eclipse1(self): test_base = os.path.join(test_dir, 'eclipse') test = os.path.join(test_base, '.settings') - result = ignore.is_ignored(test, ignore.default_ignores, {}) - assert 'Default ignore: Eclipse IDE artifact' == result + assert ignore.is_ignored(test, ignore.default_ignores, {}) def test_is_ignored_default_ignores_eclipse2(self): test_dir = self.extract_test_tar('ignore/excludes/eclipse.tgz') test_base = os.path.join(test_dir, 'eclipse') test = os.path.join(test_base, '.settings/somefile') - result = ignore.is_ignored(test, ignore.default_ignores, {}) - assert 'Default ignore: Eclipse IDE artifact' == result + assert ignore.is_ignored(test, ignore.default_ignores, {}) def test_is_ignored_default_ignores_eclipse3(self): test_dir = self.extract_test_tar('ignore/excludes/eclipse.tgz') test_base = os.path.join(test_dir, 'eclipse') test = os.path.join(test_base, '.project') - result = ignore.is_ignored(test, ignore.default_ignores, {}) - assert 'Default 
ignore: Eclipse IDE artifact' == result + assert ignore.is_ignored(test, ignore.default_ignores, {}) def test_is_ignored_default_ignores_eclipse4(self): test_dir = self.extract_test_tar('ignore/excludes/eclipse.tgz') test_base = os.path.join(test_dir, 'eclipse') test = os.path.join(test_base, '.pydevproject') - result = ignore.is_ignored(test, ignore.default_ignores, {}) - assert 'Default ignore: Eclipse IDE artifact' == result + assert ignore.is_ignored(test, ignore.default_ignores, {}) def test_is_ignored_default_ignores_mac1(self): test_dir = self.extract_test_tar('ignore/excludes/mac.tgz') test_base = os.path.join(test_dir, 'mac') test = os.path.join(test_base, '__MACOSX') - result = ignore.is_ignored(test, ignore.default_ignores, {}) - assert 'Default ignore: MacOSX artifact' == result + assert ignore.is_ignored(test, ignore.default_ignores, {}) def test_is_ignored_default_ignores_mac2(self): test_dir = self.extract_test_tar('ignore/excludes/mac.tgz') test_base = os.path.join(test_dir, 'mac') test = os.path.join(test_base, '__MACOSX/comp_match/smallrepo/._jetty_1.0_index.csv') - result = ignore.is_ignored(test, ignore.default_ignores, {}) - assert 'Default ignore: MacOSX artifact' == result + assert ignore.is_ignored(test, ignore.default_ignores, {}) def test_is_ignored_default_ignores_mac3(self): test_dir = self.extract_test_tar('ignore/excludes/mac.tgz') test_base = os.path.join(test_dir, 'mac') test = os.path.join(test_base, '.DS_Store') - result = ignore.is_ignored(test, ignore.default_ignores, {}) - assert 'Default ignore: MacOSX artifact' == result + assert ignore.is_ignored(test, ignore.default_ignores, {}) def test_is_ignored_default_ignores_mac4(self): test_dir = self.extract_test_tar('ignore/excludes/mac.tgz') test_base = os.path.join(test_dir, 'mac') test = os.path.join(test_base, '.DS_Store/a') - result = ignore.is_ignored(test, ignore.default_ignores, {}) - assert 'Default ignore: MacOSX artifact' == result + assert ignore.is_ignored(test, 
ignore.default_ignores, {}) @skipIf(on_mac, 'Return different result on Mac for reasons to investigate') def test_is_ignored_default_ignores_mac5(self): @@ -108,16 +100,16 @@ def test_is_ignored_default_ignores_mac5(self): test_base = os.path.join(test_dir, 'mac') test = os.path.join(test_base, '._.DS_Store') - result = ignore.is_ignored(test, ignore.default_ignores, {}) # this is really weird as a behavior - assert 'Default ignore: MacOSX artifact' == result + # 'Default ignore: MacOSX artifact' + assert ignore.is_ignored(test, ignore.default_ignores, {}) @skipIf(on_mac, 'Return different result on Mac for reasons to investigate') def test_is_ignored_default_ignores_msft(self): test_dir = self.extract_test_tar('ignore/excludes/msft-vs.tgz') test = os.path.join(test_dir, 'msft-vs/tst.sluo') - result = ignore.is_ignored(test, ignore.default_ignores, {}) - assert 'Default ignore: Microsoft VS project artifact' == result + # 'Default ignore: Microsoft VS project artifact' ?? + assert ignore.is_ignored(test, ignore.default_ignores, {}) @skipIf(on_mac, 'Return different result on Mac for reasons to investigate') def test_is_ignored_skip_vcs_files_and_dirs(self): @@ -144,26 +136,51 @@ def test_is_ignored_skip_vcs_files_and_dirs(self): expected = [ ('/vcs', False), - ('/vcs/.bzr', u'Default ignore: Bazaar artifact'), - ('/vcs/.git', u'Default ignore: Git artifact'), - ('/vcs/.hg', u'Default ignore: Mercurial artifact'), - ('/vcs/.repo', u'Default ignore: Multiple Git repository artifact'), - ('/vcs/.svn', u'Default ignore: SVN artifact'), - ('/vcs/CVS', u'Default ignore: CVS artifact'), - ('/vcs/_darcs', u'Default ignore: Darcs artifact'), - ('/vcs/_MTN', u'Default ignore: Monotone artifact'), - ('/vcs/.bzrignore', u'Default ignore: Bazaar config artifact'), - ('/vcs/.cvsignore', u'Default ignore: CVS config artifact'), - ('/vcs/.gitignore', u'Default ignore: Git config artifact'), - ('/vcs/.hgignore', u'Default ignore: Mercurial config artifact'), - ('/vcs/.svnignore', 
u'Default ignore: SVN config artifact'), - ('/vcs/vssver.scc', u'Default ignore: Visual Source Safe artifact'), + ('/vcs/.bzr', True), + ('/vcs/.git', True), + ('/vcs/.hg', True), + ('/vcs/.repo', True), + ('/vcs/.svn', True), + ('/vcs/CVS', True), + ('/vcs/_darcs', True), + ('/vcs/_MTN', True), + ('/vcs/.bzrignore', True), + ('/vcs/.cvsignore', True), + ('/vcs/.gitignore', True), + ('/vcs/.hgignore', True), + ('/vcs/.svnignore', True), + ('/vcs/vssver.scc', True), ] assert sorted(expected) == sorted(result) - def test_fileset_match_default_ignore_does_not_skip_one_char_names(self): + def test_fileset_is_included_with_default_ignore_does_not_skip_one_char_names(self): # use fileset directly to work on strings not locations from commoncode import fileset tests = [c for c in 'HFS+ Private Data'] + 'HFS+ Private Data'.split() - for test in tests: - assert False == fileset.match(test, includes=ignore.default_ignores, excludes={}) + result = [(t, + fileset.is_included(t, excludes=ignore.default_ignores, includes={})) + for t in tests] + expected = [ + ('H', True), + ('F', True), + ('S', True), + ('+', True), + (' ', False), + ('P', True), + ('r', True), + ('i', True), + ('v', True), + ('a', True), + ('t', True), + ('e', True), + (' ', False), + ('D', True), + ('a', True), + ('t', True), + ('a', True), + ('HFS+', True), + ('Private', True), + ('Data', True) + ] + + assert expected == result diff --git a/tests/scancode/data/help/help.txt b/tests/scancode/data/help/help.txt index 9f81b83beb7..1dafec753d3 100644 --- a/tests/scancode/data/help/help.txt +++ b/tests/scancode/data/help/help.txt @@ -1,4 +1,4 @@ -Usage: scancode [OPTIONS] +Usage: scancode [OPTIONS] ... scan the file or directory for license, origin and packages and save results to FILE(s) using one or more output format option. @@ -73,6 +73,7 @@ Options: pre-scan: --ignore Ignore files matching . + --include Include files matching . 
--classify Classify files with flags telling if the file is a legal, or readme or test file, etc. --facet = Add the to files with a path matching diff --git a/tests/scancode/test_cli.py b/tests/scancode/test_cli.py index 2215859d2fa..7c704096c04 100644 --- a/tests/scancode/test_cli.py +++ b/tests/scancode/test_cli.py @@ -630,7 +630,7 @@ def test_scan_does_scan_rpm(): check_json_scan(expected_file, result_file, regen=False) -def test_scan_cli_help(regen=False): +def test_scan_cli_help(regen=False): expected_file = test_env.get_test_loc('help/help.txt') result = run_scan_click(['--help']) if regen: diff --git a/tests/scancode/test_plugin_ignore.py b/tests/scancode/test_plugin_ignore.py index b1851397185..79094756ca2 100644 --- a/tests/scancode/test_plugin_ignore.py +++ b/tests/scancode/test_plugin_ignore.py @@ -29,9 +29,9 @@ from os.path import join from commoncode.testcase import FileDrivenTesting +from commoncode.fileset import is_included from scancode.cli_test_utils import run_scan_click from scancode.cli_test_utils import load_json_result -from scancode.plugin_ignore import is_ignored from scancode.plugin_ignore import ProcessIgnore from scancode.resource import Codebase @@ -40,30 +40,30 @@ class TestPluginIgnoreFiles(FileDrivenTesting): test_data_dir = join(dirname(__file__), 'data') - def test_is_ignored_glob_path(self): + def test_is_included_glob_path(self): location = 'common/src/test/sample.txt' - ignores = {'*/src/test/*': 'test ignore'} - assert is_ignored(location=location, ignores=ignores) + excludes = {'*/src/test/*': 'test ignore'} + assert not is_included(location, excludes=excludes) - def 
test_is_ignored_single_path_not_matching(self): + def test_is_included_single_path_not_matching(self): location = 'common/src/test/sample.txt' - ignores = {'src/test/sample.txt': 'test ignore'} - assert not is_ignored(location=location, ignores=ignores) + excludes = {'src/test/sample.txt': 'test ignore'} + assert is_included(location, excludes=excludes) - def test_is_ignored_single_file(self): + def test_is_included_single_file(self): location = 'common/src/test/sample.txt' - ignores = {'sample.txt': 'test ignore'} - assert is_ignored(location=location, ignores=ignores) + excludes = {'sample.txt': 'test ignore'} + assert not is_included(location, excludes=excludes) - def test_is_ignored_glob_file(self): + def test_is_included_glob_file(self): location = 'common/src/test/sample.txt' - ignores = {'*.txt': 'test ignore'} - assert is_ignored(location=location, ignores=ignores) + excludes = {'*.txt': 'test ignore'} + assert not is_included(location, excludes=excludes) def check_ProcessIgnore(self, test_dir, expected, ignore): codebase = Codebase(test_dir, strip_root=True) diff --git a/tests/scancode/test_resource.py b/tests/scancode/test_resource.py index 5a220a10dcc..0687cf40504 100644 --- a/tests/scancode/test_resource.py +++ b/tests/scancode/test_resource.py @@ -437,17 +437,7 @@ def test_compute_counts_when_using_disk_cache(self): def test_low_max_in_memory_does_not_raise_exception_when_ignoring_files(self): - def is_ignored(location, ignores): - """ - Return a tuple of (pattern , message) if a file at location is ignored or - False otherwise. `ignores` is a mappings of patterns to a reason. 
- - Taken from scancode/plugin_ignore.py - """ - from commoncode.fileset import match - return match(location, includes=ignores, excludes={}) - - from functools import partial + from commoncode.fileset import is_included test_codebase = self.get_test_loc('resource/client') codebase = Codebase(test_codebase, strip_root=True, max_in_memory=1) @@ -456,14 +446,14 @@ def is_ignored(location, ignores): ignores = { '*.gif': 'User ignore: Supplied by --ignore' } - ignorable = partial(is_ignored, ignores=ignores) remove_resource = codebase.remove_resource for resource in codebase.walk(topdown=True): - if ignorable(resource.path): + if not is_included(resource.path, excludes=ignores): for child in resource.children(codebase): remove_resource(child) - remove_resource(resource) + if not resource.is_root: + remove_resource(resource) # Walk through the codebase and save each Resource, # UnknownResource exception should not be raised