diff --git a/etc/scripts/genlicspdx.py b/etc/scripts/genlicspdx.py
new file mode 100644
index 00000000000..c4355844759
--- /dev/null
+++ b/etc/scripts/genlicspdx.py
@@ -0,0 +1,113 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2019 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# Data generated with ScanCode require an acknowledgment.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+#
+# When you publish or redistribute any data created with ScanCode or any ScanCode
+# derivative work, you must accompany this data with the following acknowledgment:
+#
+# Generated with ScanCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
+# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
+# ScanCode should be considered or used as legal advice. Consult an Attorney
+# for any legal advice.
+# ScanCode is a free software code scanning tool from nexB Inc. and others.
+# Visit https://github.com/nexB/scancode-toolkit/ for support and download.
+
+from __future__ import absolute_import
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import os
+
+import click
+click.disable_unicode_literals_warning = True
+
+from licensedcode.models import load_licenses
+from scancode.cli import run_scan
+
+
+"""
+Generate an SPDX document for each license known in ScanCode that is not listed
+at SPDX.
+Run python genlicspdx.py -h for help.
+
+NOTE: this is rather inefficient as it is starting a new command line process
+for each license, taking a few seconds each time.
+Upcoming code to call a scan function instead will be more efficient.
+"""
+
+FOSS_CATEGORIES = set([
+ 'Copyleft',
+ 'Copyleft Limited',
+ 'Patent License',
+ 'Permissive',
+ 'Public Domain',
+])
+
+
+@click.command()
+@click.argument('license_dir',
+    type=click.Path(file_okay=False, exists=True, writable=True,
+                    allow_dash=False, resolve_path=True),
+    metavar='DIR')
+@click.option('-v', '--verbose', is_flag=True, default=False, help='Print execution messages.')
+@click.help_option('-h', '--help')
+def cli(license_dir, verbose):
+    """
+    Create one SPDX tag-value document for each non-SPDX ScanCode licenses.
+    Store these in the DIR directory
+    """
+    # NOTE: click displays the docstring above as the command help text.
+
+    def report(template, values):
+        # echo a message formatted with `values`, only in verbose mode
+        if verbose:
+            click.echo(template.format(**values))
+
+    # scan options shared by the scans of all licenses
+    common_options = dict(
+        license=True, license_diag=True, license_text=True, info=True,
+        strip_root=True, quiet=True, return_results=False)
+
+    for lic in load_licenses(with_deprecated=False).values():
+        data = lic.to_dict()
+
+        # skip licenses that have an SPDX license key
+        if lic.spdx_license_key:
+            report('Skipping ScanCode: {key} that is an SPDX license: {spdx_license_key}', data)
+            continue
+
+        # skip licenses without a text file on disk
+        has_text = lic.text_file and os.path.exists(lic.text_file)
+        if not has_text:
+            report('Skipping license without text: {key}', data)
+            continue
+
+        # skip licenses whose category is not in FOSS_CATEGORIES
+        if lic.category not in FOSS_CATEGORIES:
+            report('Skipping non FOSS license: {key}', data)
+            continue
+
+        filename = 'licenseref-scancode-{key}.spdx'.format(**data)
+        output = os.path.join(license_dir, filename)
+
+        report('Creating SPDX document for license: {key}', data)
+        report('at: {output}', dict(output=output))
+
+        # the SPDX output is written to a file opened in binary mode
+        with open(output, 'wb') as spdx_file:
+            run_scan(input=lic.text_file, spdx_tv=spdx_file, **common_options)
+
+
+if __name__ == '__main__':
+ cli()
diff --git a/etc/scripts/scancli.py b/etc/scripts/scancli.py
new file mode 100644
index 00000000000..d5829a421fd
--- /dev/null
+++ b/etc/scripts/scancli.py
@@ -0,0 +1,74 @@
+#
+# Copyright (c) 2019 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import json
+from os.path import abspath
+from os.path import dirname
+from os.path import join
+from os.path import normpath
+
+import execnet
+
+import scanserv
+
+"""
+This is a module designed to be called from Python 2 or 3 and is the client
+side. See scanserv for the backend server module that runs on Python 2 and
+effectively runs scancode.
+"""
+
+
+def scan(locations, deserialize=False, scancode_root_dir=None):
+    """
+    Scan each path of the `locations` list independently and yield one result
+    per path: a JSON string, or Python data if `deserialize` is True.
+    `scancode_root_dir` is the ScanCode directory whose `bin/python` runs the
+    scans. The remote interpreter is shut down when the iteration ends.
+    """
+    if not scancode_root_dir:
+        # default to the directory three levels above this script file
+        scancode_root_dir = abspath(normpath(__file__))
+        scancode_root_dir = dirname(dirname(dirname(scancode_root_dir)))
+    python2 = join(scancode_root_dir, 'bin', 'python')
+    spec = 'popen//python={python2}'.format(python2=python2)
+    gateway = execnet.makegateway(spec)  # NOQA
+
+    try:
+        channel = gateway.remote_exec(scanserv)
+        for location in locations:
+            # build a mapping of options to use for this scan
+            scan_kwargs = dict(
+                location=location, license=True, license_text=True,
+                license_diag=True, copyright=True, info=True, processes=0)
+
+            channel.send(scan_kwargs)  # execute func-call remotely
+            results = channel.receive()
+            if deserialize:
+                results = json.loads(results)
+            yield results
+    finally:
+        # always shut down the remote interpreter, including when a scan
+        # fails or when the caller stops iterating before the end
+        gateway.exit()
+
+
+if __name__ == '__main__':
+ import sys # NOQA
+ args = sys.argv[1:]
+ for s in scan(args):
+ print(s)
diff --git a/etc/scripts/scanserv.README b/etc/scripts/scanserv.README
new file mode 100644
index 00000000000..1da44b34312
--- /dev/null
+++ b/etc/scripts/scanserv.README
@@ -0,0 +1,29 @@
+A simple proof of concept for Python3 remoting with execnet.
+
+See ticket #1400 for more.
+
+This is an example of how to call Scancode as a function from Python2 or Python3.
+The benefits are that when the server process has loaded the license index,
+and imported its modules there is no per-call import/loading penalty anymore.
+
+This is using execnet which is the multiprocessing library used by
+py.test and therefore a rather stable and high quality engine.
+
+To test, do this::
+
+1. checkout scancode and run ./configure in a first shell. This is for a plain
+ScanCode using Python 2 that will be used as a "server".
+
+2. in another shell, create a virtualenv with Python 3 in another
+ location. Activate that venv, and `pip install simplejson execnet`
+
+3. Change dir to the installed scancode-toolkit/etc/scripts where the scancli.py
+and scanserv.py scripts are. Then run::
+
+ python3 scancli.py ../../NOTICE ../../setup.py
+
+This will effectively make remote function calls to the Python2
+scancode and get the results back in Python3. It also allows making
+multiple calls that reuse the same process, hence amortizing any startup
+costs. Here this will run two scans: one on NOTICE and another on setup.py.
+It could have been directories too.
diff --git a/etc/scripts/scanserv.py b/etc/scripts/scanserv.py
new file mode 100644
index 00000000000..fa49bb73605
--- /dev/null
+++ b/etc/scripts/scanserv.py
@@ -0,0 +1,55 @@
+#
+# Copyright (c) 2019 nexB Inc. and others. All rights reserved.
+# http://nexb.com and https://github.com/nexB/scancode-toolkit/
+# The ScanCode software is licensed under the Apache License version 2.0.
+# ScanCode is a trademark of nexB Inc.
+#
+# You may not use this software except in compliance with the License.
+# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software distributed
+# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+# CONDITIONS OF ANY KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+"""
+Python2 "server side" of the scan server. In a given execnet session, this
+process will hold a loaded license index and can be invoked multiple times
+without the index load penalty on each call.
+"""
+
+
+def as_json(results, pretty=True):
+    """
+    Return `results` serialized as a JSON string that ends with a newline.
+    Use a two-space indent if `pretty`, compact separators otherwise.
+    """
+    # simplejson is used for its ability to handle iterables as arrays.
+    import simplejson
+
+    if pretty:
+        layout = dict(indent=2 * b' ')
+    else:
+        layout = dict(separators=(b',', b':',))
+    return simplejson.dumps(results, iterable_as_array=True, encoding='utf-8', **layout) + b'\n'
+
+
+def run_scan(location, **kwargs):
+    from scancode import cli as scancode_cli
+    prettify = kwargs.pop('pretty', True)  # a JSON formatting flag, not a scan option
+    return as_json(scancode_cli.run_scan(location, **kwargs), pretty=prettify)
+
+
+if __name__ == '__channelexec__':
+    for request in channel:  # NOQA
+        if isinstance(request, dict):  # a mapping of run_scan() keyword arguments
+            response = run_scan(**request)
+        elif isinstance(request, (str, unicode)):  # a single location string
+            response = run_scan(request)
+        else:
+            raise Exception('Unknown arguments type: ' + repr(request))
+        channel.send(response)  # NOQA
diff --git a/src/commoncode/fileset.py b/src/commoncode/fileset.py
index f01c6db5ae4..aad58783e57 100644
--- a/src/commoncode/fileset.py
+++ b/src/commoncode/fileset.py
@@ -28,21 +28,26 @@
import fnmatch
import os
-import logging
from commoncode import fileutils
from commoncode import paths
from commoncode.system import on_linux
-DEBUG = False
-logger = logging.getLogger(__name__)
-# import sys
-# logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
-# logger.setLevel(logging.DEBUG)
+
+TRACE = False
+if TRACE:
+ import logging
+ import sys
+
+ logger = logging.getLogger(__name__)
+ logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
+ logger.setLevel(logging.DEBUG)
+
POSIX_PATH_SEP = b'/' if on_linux else '/'
EMPTY_STRING = b'' if on_linux else ''
+
"""
Match files and directories paths based on inclusion and exclusion glob-style
patterns.
@@ -79,33 +84,44 @@
"""
-def match(path, includes, excludes):
+def is_included(path, includes=None, excludes=None):
"""
- Return a matching pattern value (e.g. a reason message) or False if `path` is matched or not.
- If the `path` is empty, return False.
+ Return True if `path` is included based on the mappings of `includes` and
+ `excludes` glob patterns. If the `path` is empty, return False.
Matching is done based on the set of `includes` and `excludes` patterns maps
- of {fnmtch pattern -> value} where value can be a message string or some other
- object.
- The order of the includes and excludes items does not matter and if a map is
- empty , it is not used for matching.
+ of {fnmatch pattern: message}. If `includes` are provided they are tested
+ first. The `excludes` are tested second if provided.
+
+ The ordering of the includes and excludes items does not matter and if a map
+ is empty, it is not used for matching.
"""
- includes = includes or {}
- excludes = excludes or {}
if not path or not path.strip():
return False
- included = get_matches(path, includes, all_matches=False)
- excluded = get_matches(path, excludes, all_matches=False)
- if DEBUG:
- logger.debug('in_fileset: path: %(path)r included:%(included)r, '
- 'excluded:%(excluded)r .' % locals())
- if excluded:
- return False
- elif included:
- return included
- else:
- return False
+ if not includes and not excludes:
+ return True
+
+ includes = includes or {}
+ includes = {k: v for k, v in includes.items() if k}
+ excludes = excludes or {}
+ excludes = {k: v for k, v in excludes.items() if k}
+
+ if includes:
+ included = get_matches(path, includes, all_matches=False)
+ if TRACE:
+ logger.debug('in_fileset: path: %(path)r included:%(included)r' % locals())
+ if not included:
+ return False
+
+ if excludes:
+ excluded = get_matches(path, excludes, all_matches=False)
+ if TRACE:
+ logger.debug('in_fileset: path: %(path)r excluded:%(excluded)r .' % locals())
+ if excluded:
+ return False
+
+ return True
def get_matches(path, patterns, all_matches=False):
@@ -122,13 +138,17 @@ def get_matches(path, patterns, all_matches=False):
pathstripped = path.lstrip(POSIX_PATH_SEP)
if not pathstripped:
return False
+
segments = paths.split(pathstripped)
- if DEBUG:
+
+ if TRACE:
logger.debug('_match: path: %(path)r patterns:%(patterns)r.' % locals())
+
matches = []
if not isinstance(patterns, dict):
assert isinstance(patterns, (list, tuple)), 'Invalid patterns: {}'.format(patterns)
patterns = {p: p for p in patterns}
+
for pat, value in patterns.items():
if not pat or not pat.strip():
continue
@@ -146,8 +166,9 @@ def get_matches(path, patterns, all_matches=False):
matches.append(value)
if not all_matches:
break
- if DEBUG:
+ if TRACE:
logger.debug('_match: matches: %(matches)r' % locals())
+
if not all_matches:
if matches:
return matches[0]
@@ -183,6 +204,7 @@ def includes_excludes(patterns, message):
excluded = {}
if not patterns:
return included, excluded
+
for pat in patterns:
pat = pat.strip()
if not pat or pat.startswith(POUND):
diff --git a/src/commoncode/fileutils.py b/src/commoncode/fileutils.py
index dd8c8494ba4..5debd750582 100644
--- a/src/commoncode/fileutils.py
+++ b/src/commoncode/fileutils.py
@@ -91,6 +91,8 @@ def logger_debug(*args):
ALL_SEPS = POSIX_PATH_SEP + WIN_PATH_SEP
EMPTY_STRING = b'' if on_linux else ''
DOT = b'.' if on_linux else '.'
+PATH_SEP = bytes(os.sep) if on_linux else unicode(os.sep)
+
"""
File, paths and directory utility functions.
diff --git a/src/commoncode/ignore.py b/src/commoncode/ignore.py
index d04e4892342..58f9e8831f4 100644
--- a/src/commoncode/ignore.py
+++ b/src/commoncode/ignore.py
@@ -44,7 +44,7 @@ def is_ignored(location, ignores, unignores=None, skip_special=True):
"""
if skip_special and filetype.is_special(location):
return True
- return fileset.match(location, includes=ignores, excludes=unignores)
+ return not fileset.is_included(location, includes=unignores, excludes=ignores)
def is_ignore_file(location):
diff --git a/src/formattedcode/output_json.py b/src/formattedcode/output_json.py
index 220dc3802f8..926ba86acca 100644
--- a/src/formattedcode/output_json.py
+++ b/src/formattedcode/output_json.py
@@ -76,8 +76,8 @@ def is_enabled(self, output_json, **kwargs):
return output_json
def process_codebase(self, codebase, output_json, **kwargs):
- files = self.get_files(codebase, **kwargs)
- write_json(codebase, files, output_file=output_json, pretty=False)
+ results = get_results(codebase, as_list=False, **kwargs)
+ write_json(results, output_file=output_json, pretty=False)
@output_impl
@@ -96,35 +96,47 @@ def is_enabled(self, output_json_pp, **kwargs):
return output_json_pp
def process_codebase(self, codebase, output_json_pp, **kwargs):
- files = self.get_files(codebase, **kwargs)
- write_json(codebase, files, output_file=output_json_pp, pretty=True, **kwargs)
+ results = get_results(codebase, as_list=False, **kwargs)
+ write_json(results, output_file=output_json_pp, pretty=True)
-def write_json(codebase, files, output_file,
- include_summary=False, include_score=False,
- pretty=False, **kwargs):
- # NOTE: we write as binary, not text
+def write_json(results, output_file, pretty=False, **kwargs):
+ """
+ Write `results` to the `output_file` opened file-like object.
+ """
+ # NOTE: we write as encoded, binary bytes, not as unicode, decoded text
+ kwargs = dict(iterable_as_array=True, encoding='utf-8')
+ if pretty:
+ kwargs.update(dict(indent=2 * b' '))
+ else:
+ kwargs.update(dict(separators=(b',', b':',)))
+ output_file.write(simplejson.dumps(results, **kwargs))
+ output_file.write(b'\n')
+
+
+def get_results(codebase, as_list=False, **kwargs):
+ """
+ Return an ordered mapping of scan results collected from a `codebase`.
+ If `as_list` is True, consume the "files" iterator into a list.
+ """
codebase.add_files_count_to_current_header()
- scan = OrderedDict([(b'headers', codebase.get_headers()), ])
+ results = OrderedDict([('headers', codebase.get_headers()), ])
# add codebase toplevel attributes such as summaries
if codebase.attributes:
- scan.update(codebase.attributes.to_dict())
+ results.update(codebase.attributes.to_dict())
+
+ files = OutputPlugin.get_files(codebase, **kwargs)
+ if as_list:
+ files = list(files)
+ results['files'] = files
if TRACE:
- logger_debug('write_json: files')
+ logger_debug('get_results: files')
files = list(files)
from pprint import pformat
logger_debug(pformat(files))
- scan[b'files'] = files
-
- kwargs = dict(iterable_as_array=True, encoding='utf-8')
- if pretty:
- kwargs.update(dict(indent=2 * b' '))
- else:
- kwargs.update(dict(separators=(b',', b':',)))
+ return results
- output_file.write(simplejson.dumps(scan, **kwargs))
- output_file.write(b'\n')
diff --git a/src/formattedcode/output_jsonlines.py b/src/formattedcode/output_jsonlines.py
index 8914e64ef4c..770988b667c 100644
--- a/src/formattedcode/output_jsonlines.py
+++ b/src/formattedcode/output_jsonlines.py
@@ -53,6 +53,7 @@ class JsonLinesOutput(OutputPlugin):
def is_enabled(self, output_json_lines, **kwargs):
return output_json_lines
+ # TODO: reuse the json output code and merge that in a single plugin
def process_codebase(self, codebase, output_json_lines, **kwargs):
#NOTE: we write as binary, not text
files = self.get_files(codebase, **kwargs)
diff --git a/src/plugincode/__init__.py b/src/plugincode/__init__.py
index 7ce3ec12d5a..74e7bab3779 100644
--- a/src/plugincode/__init__.py
+++ b/src/plugincode/__init__.py
@@ -189,8 +189,9 @@ def load_plugins(cls):
for stage, manager in cls.managers.items():
mgr_setup = manager.setup()
if not mgr_setup:
+ from scancode import ScancodeError
msg = 'Cannot load ScanCode plugins for stage: %(stage)s' % locals()
- raise Exception(msg)
+ raise ScancodeError(msg)
mplugin_classes, mplugin_options = mgr_setup
plugin_classes.extend(mplugin_classes)
plugin_options.extend(mplugin_options)
@@ -202,7 +203,7 @@ def setup(self):
all plugin classes).
Load and validate available plugins for this PluginManager from its
- assigned `entrypoint`. Raise an Exception if a plugin is not valid such
+ assigned `entrypoint`. Raise a ScancodeError if a plugin is not valid such
that when it does not subcclass the manager `plugin_base_class`.
Must be called once to setup the plugins of this manager.
"""
@@ -215,7 +216,7 @@ def setup(self):
entrypoint = self.entrypoint
try:
self.manager.load_setuptools_entrypoints(entrypoint)
- except ImportError, e:
+ except ImportError as e:
raise e
stage = self.stage
@@ -227,7 +228,8 @@ def setup(self):
if not issubclass(plugin_class, self.plugin_base_class):
qname = '%(stage)s:%(name)s' % locals()
plugin_base_class = self.plugin_base_class
- raise Exception(
+ from scancode import ScancodeError #NOQA
+ raise ScancodeError(
'Invalid plugin: %(qname)r: %(plugin_class)r '
'must extend %(plugin_base_class)r.' % locals())
@@ -236,7 +238,8 @@ def setup(self):
qname = '%(stage)s:%(name)s' % locals()
oname = option.name
clin = CommandLineOption
- raise Exception(
+ from scancode import ScancodeError #NOQA
+ raise ScancodeError(
'Invalid plugin: %(qname)r: option %(oname)r '
'must extend %(clin)r.' % locals())
plugin_options.append(option)
diff --git a/src/scancode/__init__.py b/src/scancode/__init__.py
index 271c66ae7ec..0115e81fcc6 100644
--- a/src/scancode/__init__.py
+++ b/src/scancode/__init__.py
@@ -72,6 +72,15 @@ def logger_debug(*args):
return logger.debug(' '.join(isinstance(a, (unicode, str))
and a or repr(a) for a in args))
+
+class ScancodeError(Exception):
+ """Base exception for scancode errors"""
+
+
+class ScancodeCliUsageError(ScancodeError, click.UsageError):
+ """Exception for command line usage errors"""
+
+
# CLI help groups
SCAN_GROUP = 'primary scans'
SCAN_OPTIONS_GROUP = 'scan options'
diff --git a/src/scancode/cli.py b/src/scancode/cli.py
index abc80a72a30..4a4e088845e 100644
--- a/src/scancode/cli.py
+++ b/src/scancode/cli.py
@@ -37,6 +37,7 @@
from collections import OrderedDict
from functools import partial
from itertools import imap
+import os
import sys
from time import time
import traceback
@@ -47,7 +48,9 @@
# import early
from scancode_config import __version__ as scancode_version
+from commoncode.fileutils import as_posixpath
from commoncode.fileutils import PATH_TYPE
+from commoncode.fileutils import POSIX_PATH_SEP
from commoncode.timeutils import time2tstamp
from plugincode import PluginManager
@@ -59,6 +62,8 @@
from plugincode import output_filter
from plugincode import output
+from scancode import ScancodeError
+from scancode import ScancodeCliUsageError
from scancode import CORE_GROUP
from scancode import DOC_GROUP
from scancode import MISC_GROUP
@@ -260,7 +265,7 @@ def print_options(ctx, param, value):
@click.pass_context
# ensure that the input path is bytes on Linux, unicode elsewhere
-@click.argument('input', metavar='