From f46bc48f7b056670f87cbc25e51a0979c7fff8db Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Sun, 24 Jan 2021 12:32:27 +0100 Subject: [PATCH 01/42] Default to 64 bits windows on CI Signed-off-by: Philippe Ombredanne --- etc/ci/azure-win.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/ci/azure-win.yml b/etc/ci/azure-win.yml index 6220857..afe1686 100644 --- a/etc/ci/azure-win.yml +++ b/etc/ci/azure-win.yml @@ -3,7 +3,7 @@ parameters: image_name: '' python_versions: [] test_suites: {} - python_architecture: x86 + python_architecture: x64 jobs: - job: ${{ parameters.job_name }} From 182532f69052c75d3621ff214ede196cbeed16e7 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 25 Jan 2021 12:54:13 +0100 Subject: [PATCH 02/42] Use wheels embedded in virtualenv.pyz Signed-off-by: Philippe Ombredanne --- configure | 2 +- configure.bat | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configure b/configure index 8f3a68e..d41bf8e 100755 --- a/configure +++ b/configure @@ -27,7 +27,7 @@ function setup { # create a virtualenv on Python mkdir -p $CONFIGURE_ROOT_DIR/tmp wget -O $CONFIGURE_ROOT_DIR/tmp/virtualenv.pyz https://bootstrap.pypa.io/virtualenv.pyz - $PYTHON_EXE $CONFIGURE_ROOT_DIR/tmp/virtualenv.pyz $CONFIGURE_ROOT_DIR/tmp + $PYTHON_EXE $CONFIGURE_ROOT_DIR/tmp/virtualenv.pyz --wheel embed --pip embed --setuptools embed --seeder pip $CONFIGURE_ROOT_DIR/tmp source $CONFIGURE_ROOT_DIR/tmp/bin/activate $CONFIGURE_ROOT_DIR/tmp/bin/pip install --upgrade pip virtualenv setuptools wheel } diff --git a/configure.bat b/configure.bat index f03ea07..ee68f9e 100644 --- a/configure.bat +++ b/configure.bat @@ -87,7 +87,7 @@ set PYTHONDONTWRITEBYTECODE=1 call mkdir "%CFG_ROOT_DIR%tmp" call curl -o "%CFG_ROOT_DIR%tmp\virtualenv.pyz" https://bootstrap.pypa.io/virtualenv.pyz -call %PYTHON_EXECUTABLE% "%CFG_ROOT_DIR%tmp\virtualenv.pyz" "%CFG_ROOT_DIR%tmp" +call %PYTHON_EXECUTABLE% "%CFG_ROOT_DIR%tmp\virtualenv.pyz" --wheel 
embed --pip embed --setuptools embed --seeder pip "%CFG_ROOT_DIR%tmp" call "%CFG_ROOT_DIR%tmp\Scripts\activate" call "%CFG_ROOT_DIR%tmp\Scripts\pip" install --upgrade pip virtualenv setuptools wheel call "%CFG_ROOT_DIR%tmp\Scripts\pip" install -e .[testing] From cd4e87beb91ea5e9380dfeb19c3530c0a92ff192 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 25 Jan 2021 12:56:18 +0100 Subject: [PATCH 03/42] Do not force an upgrade on virtualenv.pyz embeds Signed-off-by: Philippe Ombredanne --- configure | 1 - configure.bat | 1 - 2 files changed, 2 deletions(-) diff --git a/configure b/configure index d41bf8e..78e7498 100755 --- a/configure +++ b/configure @@ -29,7 +29,6 @@ function setup { wget -O $CONFIGURE_ROOT_DIR/tmp/virtualenv.pyz https://bootstrap.pypa.io/virtualenv.pyz $PYTHON_EXE $CONFIGURE_ROOT_DIR/tmp/virtualenv.pyz --wheel embed --pip embed --setuptools embed --seeder pip $CONFIGURE_ROOT_DIR/tmp source $CONFIGURE_ROOT_DIR/tmp/bin/activate - $CONFIGURE_ROOT_DIR/tmp/bin/pip install --upgrade pip virtualenv setuptools wheel } diff --git a/configure.bat b/configure.bat index ee68f9e..00cb101 100644 --- a/configure.bat +++ b/configure.bat @@ -89,7 +89,6 @@ call mkdir "%CFG_ROOT_DIR%tmp" call curl -o "%CFG_ROOT_DIR%tmp\virtualenv.pyz" https://bootstrap.pypa.io/virtualenv.pyz call %PYTHON_EXECUTABLE% "%CFG_ROOT_DIR%tmp\virtualenv.pyz" --wheel embed --pip embed --setuptools embed --seeder pip "%CFG_ROOT_DIR%tmp" call "%CFG_ROOT_DIR%tmp\Scripts\activate" -call "%CFG_ROOT_DIR%tmp\Scripts\pip" install --upgrade pip virtualenv setuptools wheel call "%CFG_ROOT_DIR%tmp\Scripts\pip" install -e .[testing] @rem Return a proper return code on failure From 51510cbdb2f2d066d6652695aed40175a37d88a4 Mon Sep 17 00:00:00 2001 From: Steven Esser Date: Thu, 11 Feb 2021 15:56:55 -0500 Subject: [PATCH 04/42] Fix .gitattributes Signed-off-by: Steven Esser --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index 
c446d38..b79df5c 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,2 @@ # Ignore all Git auto CR/LF line endings conversions -* binary +* -text From 371c11e025d2ea957ac4690951126797450f85b1 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 6 Apr 2021 23:52:40 +0200 Subject: [PATCH 05/42] Use trace not debug for tracing Signed-off-by: Philippe Ombredanne --- src/extractcode/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/extractcode/__init__.py b/src/extractcode/__init__.py index b41efdc..fb7ee0f 100644 --- a/src/extractcode/__init__.py +++ b/src/extractcode/__init__.py @@ -37,10 +37,10 @@ from commoncode.system import on_linux logger = logging.getLogger(__name__) -DEBUG = False -# import sys -# logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) -# logger.setLevel(logging.DEBUG) +TRACE = False +if TRACE: + logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) + logger.setLevel(logging.DEBUG) root_dir = join(dirname(__file__), 'bin') From 056b6c1ae0bf16af0b13c2427978cb320bce9ca4 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 6 Apr 2021 23:58:24 +0200 Subject: [PATCH 06/42] Add new CLI option to support extracting all formats With this, all supported archive formats will be tried.
Signed-off-by: Philippe Ombredanne --- src/extractcode/api.py | 25 +++++++++--- src/extractcode/archive.py | 81 ++++++++++++++++++++++++++++++-------- src/extractcode/cli.py | 32 +++++++++++++-- src/extractcode/extract.py | 21 ++++++++-- 4 files changed, 131 insertions(+), 28 deletions(-) diff --git a/src/extractcode/api.py b/src/extractcode/api.py index a5bb86c..dc4f121 100644 --- a/src/extractcode/api.py +++ b/src/extractcode/api.py @@ -23,22 +23,37 @@ """ -def extract_archives(location, recurse=True, replace_originals=False, ignore_pattern=()): +def extract_archives( + location, + recurse=True, + replace_originals=False, + ignore_pattern=(), + all_formats=False, +): """ Yield ExtractEvent while extracting archive(s) and compressed files at - `location`. If `recurse` is True, extract nested archives-in-archives - recursively. + `location`. + + If `recurse` is True, extract nested archives-in-archives recursively. + If `all_formats` is True, extract all supported archives formats. + Archives and compressed files are extracted in a directory named "-extract" created in the same directory as the archive. + Note: this API is returning an iterable and NOT a sequence. 
""" + from extractcode.extract import extract from extractcode import default_kinds + from extractcode import all_kinds + + kinds = all_kinds if all_formats else default_kinds + for xevent in extract( location=location, - kinds=default_kinds, + kinds=kinds, recurse=recurse, replace_originals=replace_originals, - ignore_pattern=ignore_pattern + ignore_pattern=ignore_pattern, ): yield xevent diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py index 32b52bc..3b3a3d5 100644 --- a/src/extractcode/archive.py +++ b/src/extractcode/archive.py @@ -38,9 +38,10 @@ from extractcode import patches from extractcode import special_package +from extractcode import libarchive2 from extractcode import patch from extractcode import sevenzip -from extractcode import libarchive2 + from extractcode.uncompress import uncompress_gzip from extractcode.uncompress import uncompress_bzip2 @@ -79,8 +80,19 @@ - http://en.wikipedia.org/wiki/List_of_file_formats#Archive_and_compressed """ -# if strict, all hanlders criteria must be matched for it to be selected -Handler = namedtuple('Handler', ['name', 'filetypes', 'mimetypes', 'extensions', 'kind', 'extractors', 'strict']) +# if strict, all handlers criteria must be matched for a handler to be selected +Handler = namedtuple( + 'Handler', + [ + 'name', + 'filetypes', + 'mimetypes', + 'extensions', + 'kind', + 'extractors', + 'strict', + ] +) def can_extract(location): @@ -96,13 +108,17 @@ def can_extract(location): def should_extract(location, kinds, ignore_pattern=()): """ - Return True if this location should be extracted based on the provided - kinds + Return True if this location should be extracted based on the provided kinds """ location = os.path.abspath(os.path.expanduser(location)) ignore_pattern = {extension : 'User ignore: Supplied by --ignore' for extension in ignore_pattern} should_ignore = is_ignored(location, ignore_pattern) - if get_extractor(location, kinds) and not should_ignore: + extractor = 
get_extractor(location, kinds=kinds) + + if TRACE_DEEP: + logger.debug(f' should_extract: extractor: {extractor}, should_ignore: {should_ignore}') + + if extractor and not should_ignore: return True @@ -113,15 +129,19 @@ def get_extractor(location, kinds=all_kinds): """ assert location location = os.path.abspath(os.path.expanduser(location)) - extractors = get_extractors(location, kinds) + extractors = get_extractors(location, kinds=kinds) if not extractors: + if TRACE_DEEP: + logger.debug(f' get_extractor: not extractors: {extractors}') return None if len(extractors) == 2: extractor1, extractor2 = extractors - nested_extractor = functional.partial(extract_twice, - extractor1=extractor1, - extractor2=extractor2) + nested_extractor = functional.partial( + extract_twice, + extractor1=extractor1, + extractor2=extractor2, + ) return nested_extractor elif len(extractors) == 1: return extractors[0] @@ -135,23 +155,38 @@ def get_extractors(location, kinds=all_kinds): location or an empty list. """ handler = get_best_handler(location, kinds) + if TRACE_DEEP: + logger.debug(f' get_extractors: handler: {handler}') + return handler and handler.extractors or [] def get_best_handler(location, kinds=all_kinds): """ - Return the best handler of None for the file at location. + Return the best handler for the file at `location` or None . 
""" location = os.path.abspath(os.path.expanduser(location)) if not filetype.is_file(location): return + handlers = list(get_handlers(location)) if TRACE_DEEP: - logger.debug('get_best_handler: handlers: %(handlers)r ' % locals()) + logger.debug(f' get_best_handler: handlers: {handlers}') + if not handlers: + return + + candidates = list(score_handlers(handlers)) + if TRACE_DEEP: + logger.debug(f' get_best_handler: candidates: {candidates}') + if not candidates: + if TRACE_DEEP: + logger.debug(f' get_best_handler: candidates: {candidates}') + return - if handlers: - candidates = score_handlers(handlers) - return candidates and pick_best_handler(candidates, kinds) + picked = pick_best_handler(candidates, kinds=kinds) + if TRACE_DEEP: + logger.debug(f' get_best_handler: picked: {picked}') + return picked def get_handlers(location): @@ -177,6 +212,8 @@ def get_handlers(location): # default to False type_matched = handler.filetypes and any(t in ftype for t in handler.filetypes) + if TRACE_DEEP: + logger.debug(f' get_handlers: handler.filetypes={handler.filetypes}') mime_matched = handler.mimetypes and any(m in mtype for m in handler.mimetypes) exts = handler.extensions if exts: @@ -201,10 +238,18 @@ def score_handlers(handlers): Score candidate handlers. Higher score is better. 
""" for handler, type_matched, mime_matched, extension_matched in handlers: + if TRACE_DEEP: + logger.debug( + f' score_handlers: handler={handler}, ' + f'type_matched={type_matched}, ' + f'mime_matched={mime_matched}, ' + f'extension_matched={extension_matched}' + ) score = 0 # increment kind value: higher kinds numerical values are more # specific by design score += handler.kind + if TRACE_DEEP: logger.debug(f' score_handlers: score += handler.kind {score}') # increment score based on matched criteria if type_matched and mime_matched and extension_matched: @@ -255,6 +300,10 @@ def pick_best_handler(candidates, kinds): """ # sort by increasing scores scored = sorted(candidates, reverse=True) + + if TRACE_DEEP: + logger.debug(f' pick_best_handler: scored: {scored}') + if not scored: return @@ -994,7 +1043,7 @@ def try_to_extract(location, target_dir, extractor): strict=False ) -PatchHandler = Handler( +`PatchHandler = Handler( name='Patch', filetypes=('diff', 'patch',), mimetypes=('text/x-diff',), diff --git a/src/extractcode/cli.py b/src/extractcode/cli.py index 5591e5d..0f31ee1 100644 --- a/src/extractcode/cli.py +++ b/src/extractcode/cli.py @@ -103,14 +103,16 @@ class ExtractCommand(cliutils.BaseCommand): @click.option('--shallow', is_flag=True, default=False, help='Do not extract recursively nested archives (e.g. 
not archives in archives).') @click.option('--replace-originals', is_flag=True, default=False, help='Replace extracted archives by the extracted content.') @click.option('--ignore', default=[], multiple=True, help='Ignore files/directories following a glob-pattern.') +@click.option('--all-formats', is_flag=True, default=False, help='Extract archives from all known formats.') @click.help_option('-h', '--help') @click.option('--about', is_flag=True, is_eager=True, callback=print_about, help='Show information about ExtractCode and licensing and exit.') @click.option('--version', is_flag=True, is_eager=True, callback=print_version, help='Show the version and exit.') -def extractcode(ctx, input, verbose, quiet, shallow, replace_originals, ignore, *args, **kwargs): # NOQA +def extractcode(ctx, input, verbose, quiet, shallow, replace_originals, ignore, all_formats, *args, **kwargs): # NOQA """extract archives and compressed files found in the file or directory tree. Archives found inside an extracted archive are extracted recursively. + Use --shallow for a shallow extraction. Extraction for each archive is done in-place in a new directory named '-extract' created side-by-side with an archive. 
""" @@ -125,17 +127,26 @@ def extract_event(item): return '' if not item: return '' + source = item.source if not isinstance(source, str): source = toascii(source, translit=True).decode('utf-8', 'replace') + if verbose: if item.done: return '' - line = source and get_relative_path(path=source, len_base_path=len_base_path, base_is_dir=base_is_dir) or '' + line = source and get_relative_path( + path=source, + len_base_path=len_base_path, + base_is_dir=base_is_dir, + ) or '' + else: line = source and fileutils.file_name(source) or '' + if not isinstance(line, str): line = toascii(line, translit=True).decode('utf-8', 'replace') + return 'Extracting: %(line)s' % locals() def display_extract_summary(): @@ -149,11 +160,19 @@ def display_extract_summary(): has_errors = has_errors or bool(xev.errors) has_warnings = has_warnings or bool(xev.warnings) source = fileutils.as_posixpath(xev.source) + if not isinstance(source, str): source = toascii(source, translit=True).decode('utf-8', 'replace') - source = get_relative_path(path=source, len_base_path=len_base_path, base_is_dir=base_is_dir) + + source = get_relative_path( + path=source, + len_base_path=len_base_path, + base_is_dir=base_is_dir, + ) + for e in xev.errors: echo_stderr('ERROR extracting: %(source)s: %(e)s' % locals(), fg='red') + for warn in xev.warnings: echo_stderr('WARNING extracting: %(source)s: %(warn)s' % locals(), fg='yellow') @@ -174,7 +193,12 @@ def display_extract_summary(): has_extract_errors = False extractibles = extract_archives( - abs_location, recurse=not shallow, replace_originals=replace_originals, ignore_pattern=ignore) + abs_location, + recurse=not shallow, + replace_originals=replace_originals, + ignore_pattern=ignore, + all_formats=all_formats, + ) if not quiet: echo_stderr('Extracting archives...', fg='green') diff --git a/src/extractcode/extract.py b/src/extractcode/extract.py index 4c4540b..e4634b6 100644 --- a/src/extractcode/extract.py +++ b/src/extractcode/extract.py @@ -123,9 +123,17 @@ 
def extract( if recurse and a nested archive is found, it is extracted to full depth first before resuming the file system walk. """ + + extract_events = extract_files( + location=location, + kinds=kinds, + recurse=recurse, + ignore_pattern=ignore_pattern, + ) + processed_events = [] processed_events_append = processed_events.append - for event in extract_files(location, kinds, recurse, ignore_pattern): + for event in extract_events: yield event if replace_originals: processed_events_append(event) @@ -155,7 +163,9 @@ def extract_files( Extract only archives of a kind listed in the `kinds` kind tuple. If `recurse` is True, extract recursively archives nested inside other - archives. If `recurse` is false, then do not extract further an already + archives. + + If `recurse` is false, then do not extract further an already extracted archive identified by the corresponding extract suffix location. """ ignored = partial(ignore.is_ignored, ignores=ignore.default_ignores, unignores={}) @@ -193,7 +203,11 @@ def extract_files( logger.debug('extract:target: %(target)r' % locals()) # extract proper - for xevent in extract_file(loc, target, kinds): + for xevent in extract_file( + location=loc, + target=target, + kinds=kinds, + ): if TRACE: logger.debug('extract:walk:extraction event: %(xevent)r' % locals()) yield xevent @@ -217,6 +231,7 @@ def extract_file( target, kinds=extractcode.default_kinds, verbose=False, + all_formats=False, ): """ Extract a single archive at `location` in the `target` directory if it is From d6f8044c73ebf75d254cdecdd501eca2fe110d9a Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Wed, 7 Apr 2021 00:01:28 +0200 Subject: [PATCH 07/42] Add support for VM image extraction #16 This is a two-step extraction using libguestfs to convert a FS to a tarball which is then extracted normally (hence dealing with links, device files and other permission oddities as a side effect).
We support VDI (VirtualBox), VMDK (VMware) and QCOW2 (QEMU). Signed-off-by: Philippe Ombredanne --- src/extractcode/archive.py | 42 ++++++++++- src/extractcode/vmimage.py | 143 +++++++++++++++++++++++++++++++++++++ 2 files changed, 182 insertions(+), 3 deletions(-) create mode 100644 src/extractcode/vmimage.py diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py index 3b3a3d5..c64e255 100644 --- a/src/extractcode/archive.py +++ b/src/extractcode/archive.py @@ -41,6 +41,7 @@ from extractcode import libarchive2 from extractcode import patch from extractcode import sevenzip +from extractcode import vmimage from extractcode.uncompress import uncompress_gzip from extractcode.uncompress import uncompress_bzip2 @@ -469,6 +470,7 @@ def try_to_extract(location, target_dir, extractor): extract_xz = sevenzip.extract extract_lzma = sevenzip.extract extract_squashfs = sevenzip.extract +extract_vm_image = vmimage.extract extract_cab = sevenzip.extract extract_nsis = sevenzip.extract extract_ishield = sevenzip.extract @@ -1034,7 +1036,7 @@ def try_to_extract(location, target_dir, extractor): ) SquashfsHandler = Handler( - name='squashfs FS', + name='SquashFS disk image', filetypes=('squashfs',), mimetypes=(), extensions=(), @@ -1043,7 +1045,38 @@ def try_to_extract(location, target_dir, extractor): strict=False ) -`PatchHandler = Handler( +QCOWHandler = Handler( + # note that there are v1, v2 and v3 formats.
+ name='QEMU QCOW2 disk image', + filetypes=('qemu qcow2 image',), + mimetypes=('application/octet-stream',), + extensions=('.qcow2',), + kind=file_system, + extractors=[extract_vm_image, extract_tar], + strict=False, +) + +VMDKHandler = Handler( + name='VMDK disk image', + filetypes=('vmware4 disk image',), + mimetypes=('application/octet-stream',), + extensions=('.vmdk',), + kind=file_system, + extractors=[extract_vm_image, extract_tar], + strict=True, +) + +VirtualBoxHandler = Handler( + name='VirtualBox disk image', + filetypes=('virtualbox disk image',), + mimetypes=('application/octet-stream',), + extensions=('.vdi',), + kind=file_system, + extractors=[extract_vm_image, extract_tar], + strict=True, +) + +PatchHandler = Handler( name='Patch', filetypes=('diff', 'patch',), mimetypes=('text/x-diff',), @@ -1111,5 +1144,8 @@ def try_to_extract(location, target_dir, extractor): AppleDmgHandler, IsoImageHandler, SquashfsHandler, - PatchHandler + QCOWHandler, + VMDKHandler, + VirtualBoxHandler, + PatchHandler, ] diff --git a/src/extractcode/vmimage.py b/src/extractcode/vmimage.py new file mode 100644 index 0000000..b4bbb04 --- /dev/null +++ b/src/extractcode/vmimage.py @@ -0,0 +1,143 @@ +# +# Copyright (c) nexB Inc. and others. +# SPDX-License-Identifier: Apache-2.0 +# +# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# ScanCode is a trademark of nexB Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import logging +import os +import shutil +import warnings + +from commoncode import command +from commoncode import fileutils +from commoncode.system import on_linux + +from extractcode import ExtractErrorFailedToExtract + +""" +Support to extract Virtual Machine image formats and the filesystem(s) they +contain. This is based on libguestfs-tools and is tested only on Linux. +Works only if libguestfs tool guestfish is in the path. + +See https://libguestfs.org/ + +On Ubuntu, you may face this issue when running guestfish: + +- https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725 +- https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662 +- https://unix.stackexchange.com/a/642914/185837 +""" + +logger = logging.getLogger(__name__) + +TRACE = False + +if TRACE: + import sys + logging.basicConfig(stream=sys.stdout) + logger.setLevel(logging.DEBUG) + +GUESTFISH_NOT_FOUND = ( + 'WARNING: guestfish executable is not installed. ' + 'Unable to extract virtual machine image: you need to install the ' + 'guestfish tool from libguestfs and extra FS drivers if needed. ' + 'See https://libguestfs.org/ for details.' +) + + +def get_command(): + """ + Return the location to the guestfish command or None. + """ + cmd_loc = shutil.which('guestfish') or None + if not cmd_loc: + warnings.warn(GUESTFISH_NOT_FOUND) + + return cmd_loc + + +def extract(location, target_dir): + """ + Extract all files from a guestfish-supported VM image archive file at + location in the target_dir directory as a tarball. + + Return a list of warning messages if any or an empty list. + Raise exception on errors. + + The extraction has a side effect to always create an intermediate tarball. + This tarball will be created as a temporary file and deleted on success. + + This works only on Linux. 
+ """ + if not on_linux: + raise ExtractErrorFailedToExtract( + f'VM Image extraction only supported on Linux for: {location}') + + assert location + abs_location = os.path.abspath(os.path.expanduser(location)) + if not os.path.exists(abs_location): + raise ExtractErrorFailedToExtract( + f'The system cannot find the path specified: {abs_location}') + + assert target_dir + abs_target_dir = os.path.abspath(os.path.expanduser(target_dir)) + if not os.path.exists(abs_target_dir): + raise ExtractErrorFailedToExtract( + f'The system cannot find the target path specified: {target_dir}') + + cmd_loc = get_command() + if not cmd_loc: + raise ExtractErrorFailedToExtract(GUESTFISH_NOT_FOUND) + + supported_gfs_formats_by_extension = { + '.qcow2': 'qcow2', + '.vmdk': 'vmdk', + '.vdi': 'vdi', + } + extension = fileutils.file_extension(location) + image_format = supported_gfs_formats_by_extension.get(extension) + + if not image_format: + raise ExtractErrorFailedToExtract(f'Unsupported image format: {location}') + + filename = fileutils.file_name(location) + + target_tarball = os.path.join(target_dir, f'{filename}.tar.gz') + + args = [ + '--ro', + f'--format={image_format}', + '--inspector', + 'tar-out', + '--add' , location, + '/', target_tarball, + 'compress:gzip', + ] + + rc, stdout, stderr = command.execute2(cmd_loc=cmd_loc, args=args) + + if rc != 0: + if TRACE: + logger.debug( + f'extract: failure: {rc}\n' + f'stderr: {stderr}\n' + f'stdout: {stdout}\n') + error = f'{stdout}\n{stderr}' + raise ExtractErrorFailedToExtract(error) + + return [] From 4b519f8c804061f233add40b8f10bcd893c25e5c Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 20 Apr 2021 00:21:34 +0200 Subject: [PATCH 08/42] Remove empty READMEs from skeleton Signed-off-by: Philippe Ombredanne --- src/README.rst | 2 -- tests/README.rst | 2 -- 2 files changed, 4 deletions(-) delete mode 100644 src/README.rst delete mode 100644 tests/README.rst diff --git a/src/README.rst b/src/README.rst deleted file 
mode 100644 index ec651fc..0000000 --- a/src/README.rst +++ /dev/null @@ -1,2 +0,0 @@ -Put your Python source code (and installable data) in this directory. - diff --git a/tests/README.rst b/tests/README.rst deleted file mode 100644 index d94783e..0000000 --- a/tests/README.rst +++ /dev/null @@ -1,2 +0,0 @@ -Put your Python test modules in this directory. - From 1fbf4e3a2782d40e5e7da8c81ebeaed707b86603 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Thu, 22 Apr 2021 23:09:52 +0200 Subject: [PATCH 09/42] Complete support for VM image extraction #16 Signed-off-by: Philippe Ombredanne --- CHANGELOG.rst | 39 +- README.rst | 68 +++- azure-pipelines.yml | 7 +- setup.cfg | 1 + src/extractcode/archive.py | 15 +- src/extractcode/extract.py | 4 +- src/extractcode/vmimage.py | 344 ++++++++++++++---- tests/data/vmimage/CHANGELOG.rst | 36 ++ .../bios-tables-test.x86_64.iso.qcow2.tar.gz | Bin 0 -> 10523 bytes ...-tables-test.x86_64.iso.qcow2.tar.gz.ABOUT | 2 + tests/data/vmimage/foobar.qcow2.tar.gz | Bin 0 -> 1383 bytes tests/test_archive.py | 13 + tests/test_vmimage.py | 59 +++ 13 files changed, 474 insertions(+), 114 deletions(-) create mode 100644 tests/data/vmimage/CHANGELOG.rst create mode 100644 tests/data/vmimage/bios-tables-test.x86_64.iso.qcow2.tar.gz create mode 100644 tests/data/vmimage/bios-tables-test.x86_64.iso.qcow2.tar.gz.ABOUT create mode 100644 tests/data/vmimage/foobar.qcow2.tar.gz create mode 100644 tests/test_vmimage.py diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 39a0096..e7c766b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,36 +1,13 @@ -Release notes -============= +Changelog +========= -vNext ------ +v (next) +-------------- + - Add support for VMDK, QCOW and VDI VM image filesystems extraction -Version 21.1.21 ---------------- -- Bump dependencies and use latest typecode and binaries. This is to fix - installation problems on multiple OSes. 
+v20.10 +------ - -Version 21.1.21 ---------------- - -- Add new [full] extra requires that install all the dependencies -- Fix bug related to commoncode libraries loading -- Improve the extra requirements -- Set minimum version for dependencies -- Improve documentation - - -Version 21.1.15 ---------------- - -- Drop support for Python 2 -- Use the latest CommonCode and TypeCode libraries -- Add azure-pipelines CI support - - -Version 20.10 -------------- - -- Initial release. + - Initial release as a split from ScanCode toolkit diff --git a/README.rst b/README.rst index 4c15702..89354cb 100644 --- a/README.rst +++ b/README.rst @@ -7,17 +7,26 @@ ExtractCode - keywords: archive, extraction, libarchive, 7zip, scancode-toolkit -ExtractCode is a universal archive extractor. It uses behind the scenes -the Python standard library, a custom ctypes binding to libarchive and -the 7zip command line to extract a large number of common and -less common archives and compressed files. It tries to extract things -in the same way on all OSes, including auto-renaming files that would -not have valid names on certain filesystems or when there are multiple -copies of the same path in a given archive. +ExtractCode is a universal archive extractor. It uses behind the scenes +multiple tools such as: + +- the Python standard library, +- a custom ctypes binding to libarchive, +- the 7zip command line +- optionally libguestfs on Linux + +With these it is possible to extract a large number of common and + +less common archives and compressed files. ExtractCode tries to extract things +in the same way on all OSes, including auto-renaming files that would not have +valid names on certain filesystems or when there are multiple copies of the same +path in a given archive (which is possible in a tar). 
+ The extraction is driven from a "voting" system that considers the -file extension(s) and name, the file type and mime type (using a ctypes +file extension(s) and name, the filetype and mimetype (using a ctypes binding to libmagic) to select the most appropriate extractor or -uncompressor function. It can handle multi-level archives such as tar.gz. +decompressor function. It can handle multi-level archives such as tar.gz and +can extract recursively nested archives. @@ -36,3 +45,44 @@ To clean up development environment:: ./configure --clean +To run the command line tool in the activated environment:: + + ./extractcode -h + + +Adding support for VM images +---------------------------- + +Adding support for VM images requires the manual installation of libguestfs and +it Python binding. You will need to install the libguestfs tools system package. +On Debian and Ubuntu:: + + sudo apt-get install libguestfs-tools + + +On Ubuntu, a manual stpe is required if the kernel executable file cannot be read. +This is required by guestfish and libguestfs and this is an oddity there and not on Debian. 
+ +Run this command as a temporary fix:: + + for k in /boot/vmlinuz-* + do sudo dpkg-statoverride --add --update root root 0644 /boot/vmlinuz-$(uname -r) + done + +or:: + + sudo chmod +r /boot/vmlinuz-*, + + +For a permanent fix see: + + - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662/comments/21 + +See also for a discussion: + + - https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725 + - https://bugzilla.redhat.com/show_bug.cgi?id=1670790 + - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662 + + + diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9a4c950..2dfc582 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -21,7 +21,9 @@ jobs: image_name: ubuntu-18.04 python_versions: ['3.6', '3.7', '3.8', '3.9'] test_suites: - all: tmp/bin/pytest -n 2 -vvs + all: + - apt-get install libguestfs-tools + - tmp/bin/pytest -n 2 -vvs - template: etc/ci/azure-linux.yml parameters: @@ -29,7 +31,8 @@ jobs: image_name: ubuntu-20.04 python_versions: ['3.6', '3.7', '3.8', '3.9'] test_suites: - all: tmp/bin/pytest -n 2 -vvs + all: + - tmp/bin/pytest -n 2 -vvs - template: etc/ci/azure-mac.yml parameters: diff --git a/setup.cfg b/setup.cfg index 0c05301..f647582 100644 --- a/setup.cfg +++ b/setup.cfg @@ -35,6 +35,7 @@ packages = find: include_package_data = true zip_safe = false install_requires = + attrs >= 18.1, !=20.1.0 commoncode >= 21.1.21 plugincode >= 21.1.21 typecode >= 21.2.23 diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py index c64e255..a396f08 100644 --- a/src/extractcode/archive.py +++ b/src/extractcode/archive.py @@ -221,10 +221,11 @@ def get_handlers(location): extension_matched = exts and location.lower().endswith(exts) if TRACE_DEEP: - logger.debug(' get_handlers: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals()) + print(f' get_handlers: matched type: {type_matched}, mime: {mime_matched}, ext: {extension_matched}' % locals()) - if 
handler.strict and not all([type_matched, mime_matched, extension_matched]): - logger.debug(' get_handlers: skip strict' % locals()) + if handler.strict and not (type_matched and mime_matched and extension_matched): + if TRACE_DEEP: + print(f' get_handlers: skip strict: {handler.name}') continue if type_matched or mime_matched or extension_matched: @@ -1052,8 +1053,8 @@ def try_to_extract(location, target_dir, extractor): mimetypes=('application/octet-stream',), extensions=('.qcow2',), kind=file_system, - extractors=[extract_vm_image, extract_tar], - strict=False, + extractors=[extract_vm_image], + strict=True, ) VMDKHandler = Handler( @@ -1062,7 +1063,7 @@ def try_to_extract(location, target_dir, extractor): mimetypes=('application/octet-stream',), extensions=('.vmdk',), kind=file_system, - extractors=[extract_vm_image, extract_tar], + extractors=[extract_vm_image], strict=True, ) @@ -1072,7 +1073,7 @@ def try_to_extract(location, target_dir, extractor): mimetypes=('application/octet-stream',), extensions=('.vdi',), kind=file_system, - extractors=[extract_vm_image, extract_tar], + extractors=[extract_vm_image], strict=True, ) diff --git a/src/extractcode/extract.py b/src/extractcode/extract.py index e4634b6..bb93203 100644 --- a/src/extractcode/extract.py +++ b/src/extractcode/extract.py @@ -30,7 +30,7 @@ from commoncode import fileutils from commoncode import ignore -import extractcode +import extractcode # NOQA import extractcode.archive logger = logging.getLogger(__name__) @@ -61,7 +61,7 @@ - Symlinks may be replaced by plain file copies as if they were regular files. Hardlinks may be recreated as regular files, not as hardlinks to the original - file. + files. - Files and directories may be renamed when their name is a duplicate. 
And a name may be considered a duplicate ignore upper and lower case mixes even diff --git a/src/extractcode/vmimage.py b/src/extractcode/vmimage.py index b4bbb04..2379b8a 100644 --- a/src/extractcode/vmimage.py +++ b/src/extractcode/vmimage.py @@ -20,11 +20,14 @@ import logging import os +import pathlib import shutil import warnings -from commoncode import command +import attr + from commoncode import fileutils +from commoncode.text import as_unicode from commoncode.system import on_linux from extractcode import ExtractErrorFailedToExtract @@ -59,85 +62,300 @@ 'See https://libguestfs.org/ for details.' ) +EXTRACTCODE_GUESTFISH_PATH_ENVVAR = 'EXTRACTCODE_GUESTFISH_PATH' + -def get_command(): +def get_command(env_var=EXTRACTCODE_GUESTFISH_PATH_ENVVAR, command='guestfish'): """ Return the location to the guestfish command or None. """ - cmd_loc = shutil.which('guestfish') or None + cmd_loc = os.environ.get(env_var, None) + if cmd_loc and os.path.exists(cmd_loc): + return cmd_loc + + cmd_loc = shutil.which(command) or None if not cmd_loc: warnings.warn(GUESTFISH_NOT_FOUND) - return cmd_loc -def extract(location, target_dir): +def check_linux_kernel_is_readable(): + """ + Return True if the kernel executable file can be read. This is required by + guestfish and libguestfs and this is an oddity mostly on Ubuntu. + + See: + - https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725 + - https://bugzilla.redhat.com/show_bug.cgi?id=1670790 + - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662 + """ + error = ( + 'libguestfs requires the kernel executable to be readable. 
' + 'This is the case on most Linux distribution except on Ubuntu.\n' + 'Run this command as a temporary fix:\n' + ' for k in /boot/vmlinuz-*\n' + ' do sudo dpkg-statoverride --add --update root root 0644 /boot/vmlinuz-$(uname -r)\n' + ' done\n' + 'or:\n' + ' sudo chmod +r /boot/vmlinuz-*\n\n', + 'For a permanent fix see: ' + 'https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662/comments/21' + ) + if on_linux: + kernels = list(pathlib.Path('/boot').glob('vmlinuz-*')) + if not kernels: + raise ExtractErrorFailedToExtract(error) + for kern in kernels: + if not os.access(kern, os.R_OK): + raise ExtractErrorFailedToExtract( + f'Unable to read kernel at: {kern}.\n{error}') + + +@attr.s +class VmImage: + location = attr.ib() + image_format = attr.ib() + guestfish_command = attr.ib() + + @classmethod + def from_file(cls, location): + """ + Build a new VMImage from the file at location. + Raise excptions on errors. + """ + if not on_linux: + raise ExtractErrorFailedToExtract('VM Image extraction only supported on Linux.') + + check_linux_kernel_is_readable() + + assert location + abs_location = os.path.abspath(os.path.expanduser(location)) + + if not os.path.exists(abs_location): + raise ExtractErrorFailedToExtract( + f'The system cannot find the path specified: {abs_location}') + + supported_gfs_formats_by_extension = { + '.qcow2': 'qcow2', + '.vmdk': 'vmdk', + '.vdi': 'vdi', + } + + extension = fileutils.file_extension(location) + image_format = supported_gfs_formats_by_extension.get(extension) + + if not image_format: + raise ExtractErrorFailedToExtract(f'Unsupported VM image format: {location}') + + cmd_loc = get_command() + if not cmd_loc: + raise ExtractErrorFailedToExtract(GUESTFISH_NOT_FOUND) + + return cls( + location=location, + image_format=image_format, + guestfish_command=cmd_loc, + ) + + def listfs(self, skip_partitions=('swap',)): + """ + Return a list of (filesystem /partition/ device path, filesystem type) for each + filesystem found in this 
image . + + We run guestfish for this: + $ guestfish --ro add foo.qcow2 : run : list-filesystems + /partition/sda1: ext4 + """ + args = [ + '--ro', + f'--format={self.image_format}', + '--add' , self.location, + 'run', + ':', 'list-filesystems', + ] + stdout = self.run_guestfish(args) + + filesystems = [] + entries = stdout.strip().splitlines(False) + for entry in entries: + entry = entry.strip() + if not entry: + continue + if ':' in entry: + partition, _, fstype = entry.partition(':') + fstype = fstype.strip() + else: + partition = entry + fstype = None + + if any(s in partition for s in skip_partitions): + continue + + filesystems.append((partition, fstype,)) + + return filesystems + + def extract_image(self, target_tarball): + """ + Extract all files from this VM image in the `target_tarball` file as a + gzipped-compressed tarball (.tar.gz). Raise exception on errors. + """ + args = [ + '--ro', + '--inspector', + f'--format={self.image_format}', + '--add', self.location, + 'tar-out', '/', target_tarball, 'compress:gzip', + ] + + self.run_guestfish(args) + + def extract_partition(self, partition, target_tarball): + """ + Extract all files from a single partition of this VM image to the + `target_tarball` file as a gzipped-compressed tarball (.tar.gz). Raise + exception on errors. + """ + # TODO: there could be devices/partitions we do not want to extract? + # guestfish --ro add foo.qcow2 : run : mount /dev/sda1 / : tar-out /etc foo.tgz compress:gzip + + args = [ + '--ro', + f'--format={self.image_format}', + '--add', self.location, + 'run', + ':', 'mount', partition, '/', + ':', 'tar-out', '/', target_tarball, 'compress:gzip', + ] + self.run_guestfish(args) + + def run_guestfish(self, args, timeout=None): + """ + Run guestfish with `args` arguments. + Return stdout as unicode string. 
Raise exception on error + """ + import subprocess + full_args = [self.guestfish_command] + args + try: + stdout = subprocess.check_output(full_args, timeout=timeout, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as cpe: + args = ' '.join([self.guestfish_command] + args) + output = as_unicode(cpe.output) + error = f'Failed to run guestfish to extract VM image: {args}\noutput: {output}' + raise ExtractErrorFailedToExtract(error) # from cpe + + return as_unicode(stdout) + + +def extract(location, target_dir, as_tarballs=True): """ Extract all files from a guestfish-supported VM image archive file at - location in the target_dir directory as a tarball. + location in the target_dir directory. Optionally only extract the + intermediate tarballs if `as_tarball` is True. Otherwise, extract to + intermediate tarballs and then extract each tarballs to the final directory. Return a list of warning messages if any or an empty list. Raise exception on errors. - - The extraction has a side effect to always create an intermediate tarball. - This tarball will be created as a temporary file and deleted on success. - This works only on Linux. 
""" - if not on_linux: - raise ExtractErrorFailedToExtract( - f'VM Image extraction only supported on Linux for: {location}') - - assert location - abs_location = os.path.abspath(os.path.expanduser(location)) - if not os.path.exists(abs_location): - raise ExtractErrorFailedToExtract( - f'The system cannot find the path specified: {abs_location}') - assert target_dir abs_target_dir = os.path.abspath(os.path.expanduser(target_dir)) - if not os.path.exists(abs_target_dir): + if not os.path.exists(abs_target_dir) or not os.path.isdir(abs_target_dir): raise ExtractErrorFailedToExtract( - f'The system cannot find the target path specified: {target_dir}') + f'The system cannot find the target directory path specified: {target_dir}') - cmd_loc = get_command() - if not cmd_loc: - raise ExtractErrorFailedToExtract(GUESTFISH_NOT_FOUND) - - supported_gfs_formats_by_extension = { - '.qcow2': 'qcow2', - '.vmdk': 'vmdk', - '.vdi': 'vdi', - } - extension = fileutils.file_extension(location) - image_format = supported_gfs_formats_by_extension.get(extension) - - if not image_format: - raise ExtractErrorFailedToExtract(f'Unsupported image format: {location}') - - filename = fileutils.file_name(location) - - target_tarball = os.path.join(target_dir, f'{filename}.tar.gz') - - args = [ - '--ro', - f'--format={image_format}', - '--inspector', - 'tar-out', - '--add' , location, - '/', target_tarball, - 'compress:gzip', - ] - - rc, stdout, stderr = command.execute2(cmd_loc=cmd_loc, args=args) - - if rc != 0: - if TRACE: - logger.debug( - f'extract: failure: {rc}\n' - f'stderr: {stderr}\n' - f'stdout: {stdout}\n') - error = f'{stdout}\n{stderr}' - raise ExtractErrorFailedToExtract(error) - - return [] + vmimage = VmImage.from_file(location) + + warnings = [] + + filename = fileutils.file_name(vmimage.location) + + # try a plain extract first + try: + + if not as_tarballs: + intermediate_dir = fileutils.get_temp_dir(prefix='extractcode-vmimage') + tdir = intermediate_dir + else: + tdir = 
target_dir + + target_tarball = os.path.join(tdir, f'{filename}.tar.gz') + vmimage.extract_image(target_tarball=target_tarball) + + if not as_tarballs: + # extract the temp tarball to the final location + warns = extract_image_tarball( + tarball=target_tarball, + target_dir=target_dir, + skip_symlinks=False) + warnings.extend(warns) + + except ExtractErrorFailedToExtract as e: + print('Cannot extract VM Image filesystems as a single file tree.') + + warnings.append(f'Cannot extract VM Image filesystems as a single file tree:\n{e}') + # fall back to file system extraction, one partition at a time + partitions = vmimage.listfs() + if not partitions: + raise + + if len(partitions) == 1: + # we can safely extract this to a root / dir as we have only one partition + partition, _parttype = partitions[0] + if not as_tarballs: + intermediate_dir = fileutils.get_temp_dir(prefix='extractcode-vmimage') + tdir = intermediate_dir + else: + tdir = target_dir + + target_tarball = os.path.join(tdir, f'{filename}.tar.gz') + vmimage.extract_partition(partition=partition, target_tarball=target_tarball) + + if not as_tarballs: + # extract the temp tarball to the final location + warns = extract_image_tarball( + tarball=target_tarball, + target_dir=target_dir, + skip_symlinks=False) + warnings.extend(warns) + else: + # with multiple partitions, we extract each partition to a unique + # base name based after the partition device name + + for partition, _parttype in partitions: + base_name = partition.replace('/', '-') + + if not as_tarballs: + intermediate_dir = fileutils.get_temp_dir(prefix='extractcode-vmimage') + tdir = intermediate_dir + else: + tdir = target_dir + + partition_tarball = os.path.join(tdir, f'{filename}-{base_name}.tar.gz') + vmimage.extract_partition(partition=partition, target_tarball=partition_tarball) + + if not as_tarballs: + # extract the temp tarball to the final location + # which is a new subdirectory + partition_target_dir = os.path.join(target_dir, 
base_name) + fileutils.create_dir(partition_target_dir) + warns = extract_image_tarball( + tarball=target_tarball, + target_dir=target_dir, + skip_symlinks=False) + warnings.extend(warns) + + return warnings + + +def extract_image_tarball(tarball, target_dir, skip_symlinks=False): + """ + Extract an intermediate image tarball to its final directory. + Return a list of warning messages + """ + from extractcode.libarchive2 import extract + return extract( + location=tarball, + target_dir=target_dir, + skip_symlinks=skip_symlinks, + ) diff --git a/tests/data/vmimage/CHANGELOG.rst b/tests/data/vmimage/CHANGELOG.rst new file mode 100644 index 0000000..39a0096 --- /dev/null +++ b/tests/data/vmimage/CHANGELOG.rst @@ -0,0 +1,36 @@ +Release notes +============= + +vNext +----- + + +Version 21.1.21 +--------------- + +- Bump dependencies and use latest typecode and binaries. This is to fix + installation problems on multiple OSes. + + +Version 21.1.21 +--------------- + +- Add new [full] extra requires that install all the dependencies +- Fix bug related to commoncode libraries loading +- Improve the extra requirements +- Set minimum version for dependencies +- Improve documentation + + +Version 21.1.15 +--------------- + +- Drop support for Python 2 +- Use the latest CommonCode and TypeCode libraries +- Add azure-pipelines CI support + + +Version 20.10 +------------- + +- Initial release. diff --git a/tests/data/vmimage/bios-tables-test.x86_64.iso.qcow2.tar.gz b/tests/data/vmimage/bios-tables-test.x86_64.iso.qcow2.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..bc946cb6c54731e3830c899ecb463ff8b06f2f18 GIT binary patch literal 10523 zcmV+$Ddg54iwFP!zI|W-1MPYRR9wxLZbRdA;~u0T5Fl9Np5QKxg#-vR8eAI-+E{Q2 zZh>Gy1B5_uXx!Zi1cE0JJOqL~{>y)7&6@XS?p<%*o4aP#{Z60#op0}|Q&nB3YoA)H z&1{_9xDcjh_LjF~>4xC;5)m>H65zIRbK-U}ck<)|{O19MLWP6`Ah*mbEC~JA{0I2? 
zZViN20LsrVbX!nB7y{*k@(T$8AkhB@ZU0TUyCFBmHS)qyTL&I zh5wsCOF{YfEdl?H8}~03EAX!xd`DP+O7uSr+TYm(#_efs4g95l%k2Pwg^er18)9aI z`1@+3nLD|+g7ZPOQaFn1rI4?qQEmz0x zdJ`oTZyhct&1@`~;oS|`l};39r(zXE@dsKPmiI8C;GBglmae%>{j^M3Q`Bih`nj&S zo|Qf}JI_5RW9K?_N0`m=S~CU;6wE6QFE+}Lzaerjy!fg4ls|pH;BmG) zg)#kM_R+G3XNzCF06H)#OavAv`1W1a-1#q~M4W2j91>} z2!#?!g)158Apr1uf7c8S@9Tr&DLlBXXAuv-g#{JnebPS^038=gAxsO(ga*(ti-!Y% zxL5>4-6+@yQ@=yc0II9K9Y-!2q!OYFrGY9*>ncghQNbTlL3|&vYTm5;B2r|P6z(Om zlf|OLQVxHqEI|79TSM06mF!6t%2pB00S#d8I~qt@u7dx38qG=&s z&nPTj@sx+FIs_RJnt+TzmLONL?+2p?1A*w)LM^S?97v$!) zv$pXxTE$EtNq$su`8Lf{kKlE+T*FV+%jcf;&)&%J@kNmbvhlRZF={cgKW<~@WoOl6 zWT_D`)Rbm~unJa-=rQtFiqzuBKjLC-%gxnhZQO35Sc22~Ro|F@#tFfcnCkm~2E+$uG>PFl8A4fJvP|AR1PftPB5@)rGY611zGk z$xar+6f5VWGeQ*i=nxbS>+%0q%<8V#C`O9sZLtU_zm~K6vF`0%O`WT5ZyZbu0364d zLGB5}SCSK;8&OkfFQjR8J%VoYI`i{FnbY{7B2eirT?=A+B3PgqVg()w>yVYETY2j1r?2|}35M|4+yo>L*ooGXH^mmtUi0?x*opNPQ+)My@72Uk8$%B? zI;KpQbwek)DWyjF|LbndpX zm6KzWj-E)ep%PmM+dQ^!Bkr_KVAsCtF9{8#`dPOV8qs!AX(EPJ$kd?HG0EHG7$RUL z(fjd-#KUKS>g{%H0rh5cc#A|f_NTJzE~AvH!pY@>+u>Hyt17s=gA190i2!>}z_L#8 zm-zRuxU-_Sgg-)Bbs9u{$z4d&ak-S}$Q6|=4cGh68*7&LD3x+v0m}N5(=F{HN?pr` z!Z=P$2x&+C0ng*BRnfI*djnYeYN;^5Ptvf2afl1%S6~R29vuywA=^?EyY=@GN!@K? 
zMJ^3#9dlXLC;^-<uy{3}ZiIC*@^uKF;os47P791%EmFect>fL1wX%Ch}v2wtNPh z2@v0rf3s5cV4r2DteNA*MmF=+KIs?n0<$5V2gM}I`-ql>F#5XK>Id9#u24a< zUo^*na)}iQZ9$$Q0JJpV>GKU=c4|zeB(cUy<0MC-DDSGHUAGk|aK}qLj^m{v<0~CG z4VGYP1t^@*q};F|+?)(m#933uJ76^SwtO+kFZ1hpewyfrcO|zqt5Q=O$ObK_UR=&4 zR{`%)!HPUid7Qw`_i7KY!YE4vu5Ubd zF4rf|kGrjk8$)@@Zmw=__UdJDo1DHUlKh^{l8S6&0kdt7x5#gx1INPGxE}ITLE7_A z4I^AUqXtMIi^n(An%Ad_5>$`95Hh;WDN4 z%@d%?f}K`q&_T%D@XIh+l$y%ecr#F>3hkRz@AGIFYh|D)wKZE`+?(iL$?Kfdb%Z^S=L9)Bbo15qe7{+1N)G zxr!K3pswJOWiF*G{&u2y4SMm&sQUV{z?e>2LzE~1S(J?>$C!&Qj;QT`Xzp=Uk$0}9 zY>kwnJ=5=K3vi$QBjru6a*HOKad&!al?KV{Z$MexPZc^qkj(J=CWY<72KMI81RBQQ(X00vH^GmNRQIB!8S}ezq%syB5d8V+U z^Ny6~nzde1Vi_>LrgvYtEIOu1z{tJ4chXtXbV+y5z`A(K2qmlb+U+ghb1SSUgJJXB zcn99z49vGmG$$%QW@u!WXFHRb z6`0Xm&|Z=@DDfgOsV-Tu^Ld-rXnjKO>1`@Az(cPFVgi}vWILd5w zk%&bZ`QoVv_XYVmMt4`RFO)=V*PA6ah*VMD-?RPrP?CA$JtDSv9M~`CyLb`JG;Erh z(JAMgW;6YhH)07{l)Qa7A^)_>|D8dbV_1e;()J4*3A|Pn*<`17sP5K2>ZhReIY9e` z+sU(<5vFha0ehGXOoV&w{Kj<`7Z`HU#p9S`b-qKHpUk#5G3~%NA_#AkaYU4YN>$|J znE1cUM3{*nJFi2_`z2a}9XKj5I6o9aqu%TlUMv#r2=MBK0SpKONK6vL`@>8_>B+8`Ru|DNDHTSd3gX0dmug)z9 zk=1uiwu(c)vryeXGhbT8d3(NY4%q4zBE#xl5dIF*s$TiRJw}~p*51YyNUdad;N%8h zDC}u(AKS^xJNP;^dvjyLjddhp@Fhj%C1PJ|2jJ2;FPRbNcDokWSBgqBKBtz*h;#M& zNMEdbI(N!|{!NmM(EH@&)tU8m3GDs%@xijP{`Qs0Y&Wg7+FZqB!WmcPIAbm4sAy>9`@dfaCGyd_E)+iJCrGtfolr2EjL#^*dSVdkuAIZqwE~ePQDy z%zkF`zGi~#Fn>I_)0G)?SydsCwImf!nWTxmZ=>T9&_#UsSBG+tKIh0dz#$Z-LWAzr^(r zEVa;k6hzo|j2Sm_uQXas%JFuJu^~2wSdh{D8u<-iygtp=xTL%#X8Z>FR2@cv`tfVU zEzH~Gk1auEsOtfcZ>7^`6Lgx(KSJ-bVjDl&Ue4nK7E~QyYhcFrjL=t2Ma41v@wyS_ zEFn(VX5}H+ytm-`y3LXa)?T}z96|k=fH{%e&(JBg4?_K|zioerJ6_S46F`F_fU_=B zHCX#DP!fwDb29z1LKq3kMP71IwlgA~OVIkH0>Y6z!oe)#>U+|Br;s6C*a0t_6gp_V zS{oSeQ){a5+v81OXfPp^)<4kR`~3PQAjc60+d8vsl_h=)=u71Tc_1Vk{9%gLB8m93 z_|u^F>T|T?sPui*d;eiH!H8jE&aHXx-s(f#K8Hs$Qape^QoYjcUZQv37n7#%o&zpM zWl2y*=(hAsiD04K3}0tu>BqIGcA@BBZ9j8fOY3GoIJc)&Je| z1^3x+pSJxi|5rpR3frxUtwb^+f$`t$%|i^dLi8t9$MzUfOjWMg%mBn z|Ehg|j)3#m+lO3ZY3khT&Y2k9gu{M75%JfH)*j+ni><2!ol^pR&$R9fw^D}PYdAnl 
zGxG+~%!+?)=oCU8W9ubIDgmX5LU}@&WMEZVV??Wwfk7CYBrs?mAFBHSGqKt0k(rKK z{jW$_5q#yha8suj>E8|n7(;2uyBNkiPQRL%lF7}?>)~fE2{(26d>L`13Df#H-f_Sk zo4QKV>EM=bd$1Ham7i4pONYNX1gTf^{(arb)#P!FnevyGt;ADKQ&n z-opI{Y+$d~=*>z{#M?0(+?jaS=~ZO4)pT!D`qIRrynkIlQr!`LdW2-$kVei|HbCWr z&f23ivF8vHHrf!r2Hz*5@>G0oDf70BnATdj8Q!UUu_Vx0S~uw^ZR$hQHrx12`vA${rGp}5ANqu? zPTXbJKC8wZ%W(diPOw@gpR|0zo8N-jhuGr<&rLxl%Ec zRy6V5aqraw(=Ri?*lbGGdPy-&D&wQe>+xzGK#v6NZEzv~Jf4$SR%*%CI*<8#+=$foAmevQrfbN*ND?F#>LFN%uj^_lO!@6=8?WwRqQ<7 zsG>{kbTbfNH>QQkyIo0dh!0Sv_nCJZfe&s~Yz>4i1wtnS_sJ%e5PL=$h|pt1oOZhR z%OKRTcRpb6G#;ZD<8SMB0>@v`b<&F8h=z%Ug}g-ghl(w(h7xzt%EuXR-OyJEt4Q^n z$VPi(^I0+UrIkR;Z*%j9d-PrnLBwqHzcQr*k*IgGirqnN6;y0rZGZjzy?S#Asb0T{ zz0`?&N$WejHo23Zx7zP@);&)NQI+s9M^O7VBF$J-0DAFlsxn$+y_NAuczseG6*+-{ zs$<=`(zZl>7!g(S<8W)65-FTusWI&5>oJ07CURVo831<;4+h0O&>5jL>9_{JOnWZz`E;3Sd{f#&cY@{;3K$Jn7CB69G-{&zHuMN}nWXta}8N&p6 z$B{(n&h)wEKW20F6SyqJ^it=eb~#jAUON+g6`5I=8RRxpDFJuwGKGD}2p{U-%}{0Q zvth^KSUN!ufR^pJAzpZOmn%rqMVaM6?K1sR=~ZeS8>6e@G(f)#UP;=+(>j{6KWM`C zYNqsoVdJz7AdT_Q?xaBmaQ(~orn#S_&sCfby8?xNR{F`-4b{&-cA#2FnH+oI?w~$P zVBRJgaY_2g8?x+~J8g0$#j6Q;O8%j6<)u-ivJ1PGQ3Gwqs22Q;c=?*xj4OdtrjA@{ zgv51DxQakemcB9^BO$2{1*Yhl!(iyc13?8$`BI4UUR6^MqE|5mTtO-Df{(JWT4%RS zJbH_-0UX4K+vHcw1MMsG-sY@8ploN<@aXl;+3iERGC6p0kL}0!U?z4!o+mXwKeU+) zsiMH})7n3Ob$pL(b-ce(It;lgPYf&CkcxP%ShME~$$6`bZXK9NH3tZQ)T%PD{rWmE zd2<6M1gMXR(As<97UwBom7A52#oPMluZ$0W!z4R@j6xE#jd}kH73g|O7&o(+Ji;?0 z%Z02*aD`wWA<;e%K0@b0qjA7@Y2oj4)r{X*l{%z+_oid%+=OOxl#2Ts znaiDV@B0_dUTKP34e+>;K?tQslyQ;;+`2`#!-;xmwHnxmg$!eIj~g+h&t@?3wZIak z9V)j7<%28zeVad@)bY<(5k1mR?I|eZkUp$0~S8SzMZ_hrKP5x{RS<(!=^Fo6scd7ybz; zR7q0;xzqamAsWE_uTT9LFj$SUJ0y2i-tk=M6&3>tr(R2xMIRG96>czD8*7 zE-%WMG#c2nwa`Q~mfkyGwTbx?VECL`bTcOVtZ`|++w`&qR(sAUc*Qq_5iVG~Az-zWM*MT#ie zHpCErk(8VECN3c3vZs>kkd=fX$*ZyJvC}1r)q4lK@|0@`WKlGYld$l%g0DICXUcb4 z!X97;*7KF{3@t&gUDU$CvFJz*^^y8Ftf=$S`ZC!YEdR~a?0Wfzj&qq{$Pt7Y1{sgJ4D z?)HxAt;GsAhd24Lh|JRd6Gxj{dkYE=V^E* z{4a053_oNUX&>wVJbZbD9&bSoIGPa4em2pV{jAj_uGAe^Qm;MSq%>eMJXCcZw^<0F!Ys?O>>H}| 
z3-J!OnF3oyPMU>Vmieh~pxEs$NV(bf7ooi#F|1WJq~v}`dOm)5n+bZg#5Kc#^)*WC zKL_Qo$2R^?YcVQ|_2;u|9S;8%*@ zZ&bD@a~LHGv)hRry$*>xNYM^FsE&0Nefe4UDywV#((LMMp2gKf(AU>_Q(Z<^ z9aEpRuktQGn_W#M+28m3k~9^#i}1@CD#+`~Q@I+O`lyyVRWO0b4RrH=6JQhZD7q|l zXhk<8CB;CFt0{{lN(UIpcdQbKegO@8VXSPcWo$+;R(CLz^csYD57QhU3;Zdo>%G|zQ2YdE zYdlx&gpv(wHt+=Z5Pt3!kHk|O_Tw= z#o14nUMnEV+&}Hmn4R5$Hp*GM9SdRquFf)`2G%Zc+)1XeB>Le3t8l{4f5>5h11lmR zuC)`P?TGH|-gsPYs=A=B;zw|?6iqmpqv$K_DwO^P+?%+=G~r7{99M}G)*#~jE)+mD zdYsTxteEn<^tIophth)+DbqL6e!R~4CQT1YIZ6r>&FnkEekw2#KD^JN!qe!Hp>~7y4JAcJw~EU@C8n`POXpG znt-}%AIjK=MYY#M2@fN`?7 z^FMSW(~)=@12um@QE3nCh-A+tu>j#4WbWFQ-^w+ob;y&f+x%`gj*8za%szJV zUs_&@V?9O8kjWqNnK-7fkbTcOD7m^U{Sd+c^V8L5k&uvLS-(yi`TWfn3Wq zHo8Swzrx`_lYE)}RRL38%WC3YdiJ>mK4GCO>62e0nwf^_S}J9!!;4|WjUP3ls=&(j z)c0qZ_m9LCHr-xpI>_78I0u07CGtMS8l7Ud*3kTtR{GxH`MQ^offi@@v9#ik@}IO* z{ynpJ1Ac>~5u`t&C_7~1Qa_aTzWU++6VD)5YsFtmF%Zq!nqeklHK*2ayq)M#g<&9} zwVae4xBIo({5G@EN45JCzhN&w7WaYgi<(o@VULFT+jEH>B_%a-6_%#XAzu>6cLkSD z%4=&WKBrE460^$zb(mA%`psiHFPsfiwd;g}erS$!cQ5m07<<^_*{IsQ^{J|wq(XJs zK2X~GD#Ef@nbXX?>s`u|a=PQwXBJx9rpGnJSQEqH{7NCVt|y3AGzJOvMt=s*%TSyy*ThE}PAl3T+T*FrMkw7_4E@4@&}ZmOJPdf*l_m1o7xJ>n zBMm4qgf)R?7+#sr_gLoi){niPARY-)mwV^;8@H8A6~Yr)hF!cekjY2ia)d_jSy_$o z=ktH?d++(vpk#*N`-M{-{Txk;=k(Q$Wj z*I7DF$l$ZTf2Ph7Sk)Z&+qF2|{lLDdfMZeoUZ3Hl-K2J&@n?y%myj=+n$ZOI>hKr% zY95$Vd*tQ*?=)|eBp2=40&0R>U6j<24=O{v-aHe$=nWe&=g<0Xw zKZ}v$b(B1eO>+*1Bgxoh0^UD3ULMqk}u@yMeO-jWA<+A5C;_|Dh$Mu%70~l$Xl$k8u?*{i`W5OLk!;L&t*@UM$sR6 zCCRyHxP1;^r~C$*^CGcVSaXNcI(7IT&udMEXT* zhuX~fKErF>x@?i?iXd+ZPt&{-G75uPw2WC7)EUYx%z*m&XTqak%}|va_`Gi7<-|ZF zQ-^{6vl6}hqxblg3FT^zeG9^DyU#22xD}XVKRI*x?O+!6RZGU#k^L4oFNi}9Aaazl zZdiCye_V{Yq@-j7flE1MBi%#lm^Lo_gB84rnN7b*LkR!y z8LH7G%%SW))UP(Gs*bCqrpYxkrmLhnG^DJ)Y6esPsO>y3#6zI0HlWN!pv=Wxt;Yq! 
zd8ok-6$q@3mf{SbSozGHYj$5Yo-D)xlrS~gH>y58s;qXqkLBT3nc?AL<7QV;ADZS- z8|IqkLCY{KH_6CIDZw}Uma@LJMfP1h*=n<3`xUyp^v|%sZX=^?nWeR*>}++;llwzz zFiYTv$x|)5s1Z!NSx-)`eoxKeNmVPUWUDmDXnWU6dtdv?N*7_z4N@w{`&8W*gf~!4wRgn8u>yNP(P7&2;)1byHX%ouvwwxlh*~5&I zGsKy9Xs-^Qi3P&f*Y%lm9EohFOJ=MBP~g-*ft>N(ey{kcX!g^a{IdK=G%$0S?6y7I z5>DX@sU3INg-B8>W?_!Z&OR*T8BoS4_$c(PFCcLm4ghkoIbasHyhV^DX#`K4!x|}6 zKFRKsz+N&Y83PMH$?OEfE|`*9aSL;#cFtfI3`rC-A!D|%1%S$;jExf;9d4n{TLg8| zwP}x(5FGt0SgPC%7v--JhShg;L4Dp(av>Vgf8Kr3E#H4ph}q2|UlP%95n9qvc)?QA z&C-2QSYngj2q|(@_V?GQZq>1|(unkDg9%`;QK3_~xwzxG$9Z~s2vv{8LHVVV!F?LD zp8qn>Sh)FasD*KiQ*BpQm?RM4jUI1M<=cG(9`xjVTy@!Xgg@xS8BlfEeMC4Iz=;jm zLot#_NdW-uQF&~Ue}25Q$mF#?|8e~?Hh`*0nkl)TTqW5ce|iV3Fm6%z)M1WJBX#lq d<)c;1f7HK-{)vC$pZM>H{{^bJCoKRv007v2;Pn6i literal 0 HcmV?d00001 diff --git a/tests/data/vmimage/bios-tables-test.x86_64.iso.qcow2.tar.gz.ABOUT b/tests/data/vmimage/bios-tables-test.x86_64.iso.qcow2.tar.gz.ABOUT new file mode 100644 index 0000000..3f9d3c1 --- /dev/null +++ b/tests/data/vmimage/bios-tables-test.x86_64.iso.qcow2.tar.gz.ABOUT @@ -0,0 +1,2 @@ +date: 2021-04-08 +download_url: https://github.com/jc-lab/pve-mirror_qemu/blob/51db2d7cf26d05a961ec0ee0eb773594b32cc4a1/tests/data/uefi-boot-images/bios-tables-test.x86_64.iso.qcow2?raw=true diff --git a/tests/data/vmimage/foobar.qcow2.tar.gz b/tests/data/vmimage/foobar.qcow2.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..8bf6215f0020e80ea72b9a3ec0114613e2b1341c GIT binary patch literal 1383 zcmb2|=3uZ}Uz@21Ho_QY<>=_>n)vJ6 zZVSnt6P2N->$3Hjqw1E+QM-#1O_yw%l{)wDue&dPym<0YeO~RSm8%zey3ez_ct7l( z{JZ0Sb9ZIy|7&*+af;dZlZTOkB4AAWpJ0so6TK-l=diyPfPj%sT)t_tR z3a`AL``h;2_tW3oW82ry=G`dx$frZaMe0#E|Ge3McYb#HeY$w9{ry>04{l`0^>4d= zdv)ZsnK@=3_qi`$|L4=6>sOE4|0;O8`}@7hU4L`$?cP zI$mn5|Nrx!9YevEuT^_zN`L*inq9^-pWlzk;U|y%dr#rHb=G7bid-3!A^|znnH`V?-eVhIJ{;eC= z_t)Flo4>p9eD+4`6moKH&kZT|P)gZ(T2lotD6t)GJa2gJRSKjr<` zxpHN_hW=Onr%V4W+;hc#%Jg6DP)RRsT`R@~0;g{-eeC+2K}7k#-21ru|IhwE9%??f 
zGdlb2*RXGH8$FrWAARzEZTFwTtVC%-8yxx{bgcZQ?DuM+@2~#JOXSL)9eSKM>liQq WlZ09DIk1804=>BVOFRs0Tnqr=u%c=J literal 0 HcmV?d00001 diff --git a/tests/test_archive.py b/tests/test_archive.py index 71b1da2..7dee15f 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -20,6 +20,7 @@ # import os +from pathlib import Path import pytest @@ -215,6 +216,18 @@ def test_get_extractor_for_compressed_svgz_docs(self): expected = [] self.check_get_extractors(test_file, expected, kinds=extractcode.default_kinds) + def test_get_extractor_qcow2(self): + test_file = self.extract_test_tar('vmimage/foobar.qcow2.tar.gz') + test_file = str(Path(test_file) / 'foobar.qcow2') + + expected = [] + self.check_get_extractors(test_file, expected, kinds=extractcode.default_kinds) + + expected = [archive.extract_vm_image] + self.check_get_extractors(test_file, expected, kinds=()) + self.check_get_extractors(test_file, expected, kinds=(extractcode.file_system, )) + self.check_get_extractors(test_file, expected, kinds=extractcode.all_kinds) + def test_get_extractor_for_dia(self): test_file = self.get_test_loc('archive/dia/dia.dia', copy=True) diff --git a/tests/test_vmimage.py b/tests/test_vmimage.py new file mode 100644 index 0000000..76fa9aa --- /dev/null +++ b/tests/test_vmimage.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +# +# Copyright (c) nexB Inc. and others. +# SPDX-License-Identifier: Apache-2.0 +# +# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# ScanCode is a trademark of nexB Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +import os +from pathlib import Path + +import pytest + +from commoncode.system import on_linux + +from extractcode_assert_utils import BaseArchiveTestCase +from extractcode_assert_utils import check_files + +from extractcode import vmimage + + +@pytest.mark.skipif(not on_linux, reason='Only linux supports image extraction') +class TestExtractVmImage(BaseArchiveTestCase): + test_data_dir = os.path.join(os.path.dirname(__file__), 'data') + + def test_can_listfs_from_qcow2_image(self): + test_file = self.extract_test_tar('vmimage/foobar.qcow2.tar.gz') + test_file = str(Path(test_file) / 'foobar.qcow2') + vmi = vmimage.VmImage.from_file(test_file) + assert [('/dev/sda', 'ext2')] == vmi.listfs() + + def test_can_extract_qcow2_vm_image_as_tarball(self): + test_file = self.extract_test_tar('vmimage/foobar.qcow2.tar.gz') + test_file = str(Path(test_file) / 'foobar.qcow2') + target_dir = self.get_temp_dir('vmimage') + vmimage.extract(location=test_file, target_dir=target_dir, as_tarballs=True) + expected = ['foobar.qcow2.tar.gz'] + check_files(target_dir, expected) + + def test_can_extract_qcow2_vm_image_not_as_tarball(self): + test_file = self.extract_test_tar('vmimage/bios-tables-test.x86_64.iso.qcow2.tar.gz') + test_file = str(Path(test_file) / 'bios-tables-test.x86_64.iso.qcow2') + target_dir = self.get_temp_dir('vmimage') + vmimage.extract(location=test_file, target_dir=target_dir, as_tarballs=False) + expected = ['bios_tab.fat', 'boot.cat'] + check_files(target_dir, expected) From aa5da29014ce4fbffca53c09689a2623e2b78196 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Thu, 22 Apr 2021 23:10:11 +0200 Subject: [PATCH 10/42] Add CLI scripts copied from scancode-toolkit Signed-off-by: Philippe Ombredanne --- extractcode | 118 ++++++++++++++++++++++++++++++++++++++++++++++++ extractcode.bat | 32 +++++++++++++ 2 files changed, 150 insertions(+) create mode 
100755 extractcode create mode 100644 extractcode.bat diff --git a/extractcode b/extractcode new file mode 100755 index 0000000..58ede97 --- /dev/null +++ b/extractcode @@ -0,0 +1,118 @@ +#!/bin/bash +# +# Copyright (c) nexB Inc. and others. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# ScanCode is a trademark of nexB Inc. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. +# +# A minimal shell wrapper to the CLI entry point fo ExtractCode + + +################################################################################### +# from https://raw.githubusercontent.com/mkropat/sh-realpath/58c03982cfd8accbcf0c4426a4adf0f120a8b2bb/realpath.sh +# realpath emulation for portability on *nix +# this allow running scancode from arbitrary locations and from symlinks +# +# Copyright (c) 2014 Michael Kropat +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +# THE SOFTWARE. + +realpath() { + canonicalize_path "$(resolve_symlinks "$1")" +} + +resolve_symlinks() { + _resolve_symlinks "$1" +} + +_resolve_symlinks() { + _assert_no_path_cycles "$@" || return + + local dir_context path + path=$(readlink -- "$1") + if [ $? -eq 0 ]; then + dir_context=$(dirname -- "$1") + _resolve_symlinks "$(_prepend_dir_context_if_necessary "$dir_context" "$path")" "$@" + else + printf '%s\n' "$1" + fi +} + +_prepend_dir_context_if_necessary() { + if [ "$1" = . ]; then + printf '%s\n' "$2" + else + _prepend_path_if_relative "$1" "$2" + fi +} + +_prepend_path_if_relative() { + case "$2" in + /* ) printf '%s\n' "$2" ;; + * ) printf '%s\n' "$1/$2" ;; + esac +} + +_assert_no_path_cycles() { + local target path + + target=$1 + shift + + for path in "$@"; do + if [ "$path" = "$target" ]; then + return 1 + fi + done +} + +canonicalize_path() { + if [ -d "$1" ]; then + _canonicalize_dir_path "$1" + else + _canonicalize_file_path "$1" + fi +} + +_canonicalize_dir_path() { + (cd "$1" 2>/dev/null && pwd -P) +} + +_canonicalize_file_path() { + local dir file + dir=$(dirname -- "$1") + file=$(basename -- "$1") + (cd "$dir" 2>/dev/null && printf '%s/%s\n' "$(pwd -P)" "$file") +} + +################################################################################### +# Now run scancode proper + +EXTRACTCODE_BIN="$( realpath "${BASH_SOURCE[0]}" )" +EXTRACTCODE_ROOT_DIR="$( cd "$( dirname "${EXTRACTCODE_BIN}" )" && pwd )" + +EXTRACTCODE_CONFIGURED_PYTHON="$EXTRACTCODE_ROOT_DIR/bin/python" +if [ ! -f "$EXTRACTCODE_CONFIGURED_PYTHON" ]; then + echo "* Configuring ExtractCode for first use..." 
+ CONFIGURE_QUIET=1 "$EXTRACTCODE_ROOT_DIR/configure" +fi + +"$EXTRACTCODE_ROOT_DIR/bin/extractcode" "$@" diff --git a/extractcode.bat b/extractcode.bat new file mode 100644 index 0000000..d9d081f --- /dev/null +++ b/extractcode.bat @@ -0,0 +1,32 @@ +@echo OFF + +@rem Copyright (c) nexB Inc. and others. All rights reserved. +@rem SPDX-License-Identifier: Apache-2.0 +@rem See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +@rem ScanCode is a trademark of nexB Inc. +@rem See https://github.com/nexB/extractcode for support or download. +@rem See https://aboutcode.org for more information about nexB OSS projects. + +@rem A wrapper to ExtractCode command line entry point + +set EXTRACTCODE_ROOT_DIR=%~dp0 +set EXTRACTCODE_CONFIGURED_PYTHON=%EXTRACTCODE_ROOT_DIR%Scripts\python.exe + +if not exist "%EXTRACTCODE_CONFIGURED_PYTHON%" goto configure +goto extractcode + +:configure +echo * Configuring ExtractCode for first use... +set CONFIGURE_QUIET=1 +call "%EXTRACTCODE_ROOT_DIR%configure" + +@rem Return a proper return code on failure +if %errorlevel% neq 0 ( + exit /b %errorlevel% +) + +:extractcode +@rem without this things may not always work on Windows 10, but this makes things slower +set PYTHONDONTWRITEBYTECODE=1 + +"%EXTRACTCODE_ROOT_DIR%Scripts\extractcode" %* From 89d01bfe38ed40e076afcc8d7c92283a98ac0612 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Fri, 23 Apr 2021 07:36:20 +0200 Subject: [PATCH 11/42] Use correct paths in script Signed-off-by: Philippe Ombredanne --- extractcode | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extractcode b/extractcode index 58ede97..bb91c2a 100755 --- a/extractcode +++ b/extractcode @@ -109,10 +109,10 @@ _canonicalize_file_path() { EXTRACTCODE_BIN="$( realpath "${BASH_SOURCE[0]}" )" EXTRACTCODE_ROOT_DIR="$( cd "$( dirname "${EXTRACTCODE_BIN}" )" && pwd )" -EXTRACTCODE_CONFIGURED_PYTHON="$EXTRACTCODE_ROOT_DIR/bin/python" 
+EXTRACTCODE_CONFIGURED_PYTHON="$EXTRACTCODE_ROOT_DIR/tmp/bin/python" if [ ! -f "$EXTRACTCODE_CONFIGURED_PYTHON" ]; then echo "* Configuring ExtractCode for first use..." CONFIGURE_QUIET=1 "$EXTRACTCODE_ROOT_DIR/configure" fi -"$EXTRACTCODE_ROOT_DIR/bin/extractcode" "$@" +"$EXTRACTCODE_ROOT_DIR/tmp/bin/extractcode" "$@" From d0ba2c20efa14c288e972932a1fb62d43cebcc13 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Fri, 23 Apr 2021 07:37:25 +0200 Subject: [PATCH 12/42] Expand supported VM images types Signed-off-by: Philippe Ombredanne --- src/extractcode/archive.py | 4 ++-- src/extractcode/vmimage.py | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py index a396f08..6841fc9 100644 --- a/src/extractcode/archive.py +++ b/src/extractcode/archive.py @@ -1049,9 +1049,9 @@ def try_to_extract(location, target_dir, extractor): QCOWHandler = Handler( # note that there are v1, v2 and v3 formats. name='QEMU QCOW2 disk image', - filetypes=('qemu qcow2 image',), + filetypes=('qemu qcow2 image', 'qemu qcow image',), mimetypes=('application/octet-stream',), - extensions=('.qcow2',), + extensions=('.qcow2', '.qcow', '.qcow2c', '.img',), kind=file_system, extractors=[extract_vm_image], strict=True, diff --git a/src/extractcode/vmimage.py b/src/extractcode/vmimage.py index 2379b8a..5ee381d 100644 --- a/src/extractcode/vmimage.py +++ b/src/extractcode/vmimage.py @@ -137,6 +137,9 @@ def from_file(cls, location): supported_gfs_formats_by_extension = { '.qcow2': 'qcow2', + '.qcow2c': 'qcow2', + '.qcow': 'qcow2', + '.img': 'qcow2', '.vmdk': 'vmdk', '.vdi': 'vdi', } From d6fe59fd2e832075905ecb27235640a2776dad7a Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Fri, 7 May 2021 14:56:42 +0200 Subject: [PATCH 13/42] Update markers syntax for pytest Signed-off-by: Philippe Ombredanne --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 
55fb92c..a3bda44 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -41,6 +41,6 @@ python_functions = "test" addopts = [ "-rfExXw", - "--strict", + "--strict-markers", "--doctest-modules" ] From ca6ab2189a6ff6fd093dc9424aa17183a05e6988 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Fri, 7 May 2021 14:59:17 +0200 Subject: [PATCH 14/42] Add fallback version for setuptools_scm This will work even from a git archive or when git is not installed. Signed-off-by: Philippe Ombredanne --- .gitattributes | 1 + pyproject.toml | 1 + 2 files changed, 2 insertions(+) diff --git a/.gitattributes b/.gitattributes index b79df5c..96c89ce 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,3 @@ # Ignore all Git auto CR/LF line endings conversions * -text +pyproject.toml export-subst diff --git a/pyproject.toml b/pyproject.toml index a3bda44..52caac4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,6 +3,7 @@ requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 4"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] +fallback_version = "v9999.$Format:%h-%cs$" [tool.pytest.ini_options] norecursedirs = [ From 1364bbbb9c399bd535686ea4ec6bfc241eb0e689 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 11 May 2021 10:57:19 +0200 Subject: [PATCH 15/42] Add note for setuptools_scm fallback version Signed-off-by: Philippe Ombredanne --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 52caac4..8eebe91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,6 +3,8 @@ requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 4"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] +# this is populated when creating a git archive +# and used when there is no .git dir and/or there is no git installed fallback_version = "v9999.$Format:%h-%cs$" [tool.pytest.ini_options] norecursedirs = [ From be851b017a6e5c98ad85a84cda8b3f070e7acf34 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: 
Tue, 11 May 2021 11:00:26 +0200 Subject: [PATCH 16/42] Use azure-posix.yml for linux and macOS Signed-off-by: Philippe Ombredanne --- azure-pipelines.yml | 10 +++--- etc/ci/azure-mac.yml | 36 --------------------- etc/ci/{azure-linux.yml => azure-posix.yml} | 0 3 files changed, 5 insertions(+), 41 deletions(-) delete mode 100644 etc/ci/azure-mac.yml rename etc/ci/{azure-linux.yml => azure-posix.yml} (100%) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 9a4c950..31ef36f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -7,7 +7,7 @@ jobs: - - template: etc/ci/azure-linux.yml + - template: etc/ci/azure-posix.yml parameters: job_name: ubuntu16_cpython image_name: ubuntu-16.04 @@ -15,7 +15,7 @@ jobs: test_suites: all: tmp/bin/pytest -vvs - - template: etc/ci/azure-linux.yml + - template: etc/ci/azure-posix.yml parameters: job_name: ubuntu18_cpython image_name: ubuntu-18.04 @@ -23,7 +23,7 @@ jobs: test_suites: all: tmp/bin/pytest -n 2 -vvs - - template: etc/ci/azure-linux.yml + - template: etc/ci/azure-posix.yml parameters: job_name: ubuntu20_cpython image_name: ubuntu-20.04 @@ -31,7 +31,7 @@ jobs: test_suites: all: tmp/bin/pytest -n 2 -vvs - - template: etc/ci/azure-mac.yml + - template: etc/ci/azure-posix.yml parameters: job_name: macos1014_cpython image_name: macos-10.14 @@ -39,7 +39,7 @@ jobs: test_suites: all: tmp/bin/pytest -n 2 -vvs - - template: etc/ci/azure-mac.yml + - template: etc/ci/azure-posix.yml parameters: job_name: macos1015_cpython image_name: macos-10.15 diff --git a/etc/ci/azure-mac.yml b/etc/ci/azure-mac.yml deleted file mode 100644 index 752ae2e..0000000 --- a/etc/ci/azure-mac.yml +++ /dev/null @@ -1,36 +0,0 @@ -parameters: - job_name: '' - image_name: '' - python_versions: [] - test_suites: {} - python_architecture: x64 - -jobs: - - job: ${{ parameters.job_name }} - - pool: - vmImage: ${{ parameters.image_name }} - - strategy: - matrix: - ${{ each pyver in parameters.python_versions }}: - ${{ each tsuite in 
parameters.test_suites }}: - ${{ format('py{0} {1}', pyver, tsuite.key) }}: - python_version: ${{ pyver }} - test_suite_label: ${{ tsuite.key }} - test_suite: ${{ tsuite.value }} - steps: - - checkout: self - fetchDepth: 10 - - - task: UsePythonVersion@0 - inputs: - versionSpec: '$(python_version)' - architecture: '${{ parameters.python_architecture }}' - displayName: 'Install Python $(python_version)' - - - script: ./configure - displayName: 'Run Configure' - - - script: $(test_suite) - displayName: 'Run $(test_suite_label) tests with py$(python_version) on ${{ parameters.job_name }}' diff --git a/etc/ci/azure-linux.yml b/etc/ci/azure-posix.yml similarity index 100% rename from etc/ci/azure-linux.yml rename to etc/ci/azure-posix.yml From 4f0aecf4f2a01c71b8d0f54987cd68de5f7922c2 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 11 May 2021 11:14:23 +0200 Subject: [PATCH 17/42] Adopt new configure script derived from ScanCode Signed-off-by: Philippe Ombredanne --- configure | 164 ++++++++++++++++++++++++---- configure.bat | 238 ++++++++++++++++++++++++++--------------- etc/ci/azure-posix.yml | 7 +- etc/ci/azure-win.yml | 5 +- 4 files changed, 304 insertions(+), 110 deletions(-) diff --git a/configure b/configure index 78e7498..25ab0ce 100755 --- a/configure +++ b/configure @@ -1,43 +1,169 @@ #!/usr/bin/env bash # -# Copyright (c) nexB Inc. and others. +# Copyright (c) nexB Inc. and others. All rights reserved. # SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/ for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # set -e #set -x -# source this script for a basic setup and configuration for local development +################################ +# A configuration script to set things up: +# create a virtualenv and install or update thirdparty packages. 
+# Source this script for initial configuration +# Use configure --help for details +# +# This script will search for a virtualenv.pyz app in etc/thirdparty/virtualenv.pyz +# Otherwise it will download the latest from the VIRTUALENV_PYZ_URL default +################################ +CLI_ARGS=$1 -CONFIGURE_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +################################ +# Defaults. Change these variables to customize this script +################################ +# Requirement arguments passed to pip and used by default or with --dev. +REQUIREMENTS="--editable ." +DEV_REQUIREMENTS="--editable .[testing]" -if [[ "$1" == "--clean" ]]; then - rm -rf "$CONFIGURE_ROOT_DIR/tmp" - exit +# where we create a virtualenv +VIRTUALENV_DIR=tmp + +# Cleanable files and directories with the --clean option +CLEANABLE=" + build + tmp" + +# extra arguments passed to pip +PIP_EXTRA_ARGS=" " + +# the URL to download virtualenv.pyz if needed +VIRTUALENV_PYZ_URL=https://bootstrap.pypa.io/virtualenv.pyz +################################ + + +################################ +# Current directory where this script lives +CFG_ROOT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +CFG_BIN_DIR=$CFG_ROOT_DIR/$VIRTUALENV_DIR/bin + + +################################ +# Set the quiet flag to empty if not defined +if [[ "$CFG_QUIET" == "" ]]; then + CFG_QUIET=" " fi -if [[ "$PYTHON_EXE" == "" ]]; then - PYTHON_EXE=python3 +################################ +# find a proper Python to run +# Use environment variables or a file if available. +# Otherwise the latest Python by default. 
+if [[ "$PYTHON_EXECUTABLE" == "" ]]; then + # check for a file named PYTHON_EXECUTABLE + if [ -f "$CFG_ROOT_DIR/PYTHON_EXECUTABLE" ]; then + PYTHON_EXECUTABLE=$(cat "$CFG_ROOT_DIR/PYTHON_EXECUTABLE") + else + PYTHON_EXECUTABLE=python3 + fi fi -function setup { - # create a virtualenv on Python - mkdir -p $CONFIGURE_ROOT_DIR/tmp - wget -O $CONFIGURE_ROOT_DIR/tmp/virtualenv.pyz https://bootstrap.pypa.io/virtualenv.pyz - $PYTHON_EXE $CONFIGURE_ROOT_DIR/tmp/virtualenv.pyz --wheel embed --pip embed --setuptools embed --seeder pip $CONFIGURE_ROOT_DIR/tmp - source $CONFIGURE_ROOT_DIR/tmp/bin/activate +################################ +cli_help() { + echo An initial configuration script + echo " usage: ./configure [options]" + echo + echo The default is to configure for regular use. Use --dev for development. + echo + echo The options are: + echo " --clean: clean built and installed files and exit." + echo " --dev: configure the environment for development." + echo " --help: display this help message and exit." + echo + echo By default, the python interpreter version found in the path is used. + echo Alternatively, the PYTHON_EXECUTABLE environment variable can be set to + echo configure another Python executable interpreter to use. If this is not + echo set, a file named PYTHON_EXECUTABLE containing a single line with the + echo path of the Python executable to use will be checked last. + set +e + exit } -setup +clean() { + # Remove cleanable file and directories and files from the root dir. + echo "* Cleaning ..." 
+ for cln in $CLEANABLE; + do rm -rf "${CFG_ROOT_DIR:?}/${cln:?}"; + done + set +e + exit +} -$CONFIGURE_ROOT_DIR/tmp/bin/pip install -e .[testing] -if [ -f "$CONFIGURE_ROOT_DIR/tmp/bin/activate" ]; then - source "$CONFIGURE_ROOT_DIR/tmp/bin/activate" -fi +create_virtualenv() { + # create a virtualenv for Python + # Note: we do not use the bundled Python 3 "venv" because its behavior and + # presence is not consistent across Linux distro and sometimes pip is not + # included either by default. The virtualenv.pyz app cures all these issues. + + VENV_DIR="$1" + if [ ! -f "$CFG_BIN_DIR/python" ]; then + + mkdir -p "$CFG_ROOT_DIR/$VENV_DIR" + + if [ -f "$CFG_ROOT_DIR/etc/thirdparty/virtualenv.pyz" ]; then + VIRTUALENV_PYZ="$CFG_ROOT_DIR/etc/thirdparty/virtualenv.pyz" + else + VIRTUALENV_PYZ="$CFG_ROOT_DIR/$VENV_DIR/virtualenv.pyz" + wget -O "$VIRTUALENV_PYZ" "$VIRTUALENV_PYZ_URL" + fi + + $PYTHON_EXECUTABLE "$VIRTUALENV_PYZ" \ + --wheel embed --pip embed --setuptools embed \ + --seeder pip \ + --never-download \ + --no-periodic-update \ + --no-vcs-ignore \ + $CFG_QUIET \ + "$CFG_ROOT_DIR/$VENV_DIR" + fi +} + + +install_packages() { + # install requirements in virtualenv + # note: --no-build-isolation means that pip/wheel/setuptools will not + # be reinstalled a second time and reused from the virtualenv and this + # speeds up the installation. + # We always have the PEP517 build dependencies installed already. + + "$CFG_BIN_DIR/pip" install \ + --upgrade \ + --no-build-isolation \ + $CFG_QUIET \ + $PIP_EXTRA_ARGS \ + $1 +} + + +################################ +# Main command line entry point +CFG_DEV_MODE=0 +CFG_REQUIREMENTS=$REQUIREMENTS + +case "$CLI_ARGS" in + --help) cli_help;; + --clean) clean;; + --dev) CFG_REQUIREMENTS="$DEV_REQUIREMENTS" && CFG_DEV_MODE=1;; +esac + +create_virtualenv "$VIRTUALENV_DIR" +install_packages "$CFG_REQUIREMENTS" +. 
"$CFG_BIN_DIR/activate" set +e diff --git a/configure.bat b/configure.bat index 00cb101..8c497ba 100644 --- a/configure.bat +++ b/configure.bat @@ -1,120 +1,180 @@ @echo OFF @setlocal -@rem Copyright (c) nexB Inc. http://www.nexb.com/ - All rights reserved. + +@rem Copyright (c) nexB Inc. and others. All rights reserved. +@rem SPDX-License-Identifier: Apache-2.0 +@rem See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +@rem See https://github.com/nexB/ for support or download. +@rem See https://aboutcode.org for more information about nexB OSS projects. + @rem ################################ -@rem # A configuration script for Windows -@rem # -@rem # The options and (optional) arguments are: -@rem # --clean : this is exclusive of anything else and cleans the environment -@rem # from built and installed files -@rem # -@rem # --python < path to python.exe> : this must be the first argument and set -@rem # the path to the Python executable to use. If < path to python.exe> is -@rem # set to "path", then the executable will be the python.exe available -@rem # in the PATH. +@rem # A configuration script to set things up: +@rem # create a virtualenv and install or update thirdparty packages. +@rem # Source this script for initial configuration +@rem # Use configure --help for details + +@rem # This script will search for a virtualenv.pyz app in etc\thirdparty\virtualenv.pyz +@rem # Otherwise it will download the latest from the VIRTUALENV_PYZ_URL default @rem ################################ -@rem Current directory where this .bat files lives -set CFG_ROOT_DIR=%~dp0 -@rem path where a configured Python should live in the current virtualenv if installed -set CONFIGURED_PYTHON=%CFG_ROOT_DIR%tmp\Scripts\python.exe -set PYTHON_EXECUTABLE= +@rem ################################ +@rem # Defaults. 
Change these variables to customize this script +@rem ################################ + +@rem # Requirement arguments passed to pip and used by default or with --dev. +set "REQUIREMENTS=--editable ." +set "DEV_REQUIREMENTS=--editable .[testing]" + +@rem # where we create a virtualenv +set "VIRTUALENV_DIR=tmp" + +@rem # Cleanable files and directories to delete with the --clean option +set "CLEANABLE=build tmp" -@rem parse command line options and arguments -:collectopts -if "%1" EQU "--help" (goto cli_help) -if "%1" EQU "--clean" (call rmdir /s /q "%CFG_ROOT_DIR%tmp") && call exit /b -if "%1" EQU "--python" (set PROVIDED_PYTHON=%~2) && shift && shift && goto collectopts +@rem # extra arguments passed to pip +set "PIP_EXTRA_ARGS= " -@rem If we have a pre-configured Python in our virtualenv, reuse this as-is and run -if exist ""%CONFIGURED_PYTHON%"" ( - set PYTHON_EXECUTABLE=%CONFIGURED_PYTHON% - goto run +@rem # the URL to download virtualenv.pyz if needed +set VIRTUALENV_PYZ_URL=https://bootstrap.pypa.io/virtualenv.pyz +@rem ################################ + + +@rem ################################ +@rem # Current directory where this script lives +set CFG_ROOT_DIR=%~dp0 +set "CFG_BIN_DIR=%CFG_ROOT_DIR%\%VIRTUALENV_DIR%\Scripts" + + +@rem ################################ +@rem # Set the quiet flag to empty if not defined +if not defined CFG_QUIET ( + set "CFG_QUIET= " ) -@rem If we have a command arg for Python use this as-is -if ""%PROVIDED_PYTHON%""==""path"" ( - @rem use a bare python available in the PATH - set PYTHON_EXECUTABLE=python - goto run + +@rem ################################ +@rem # Main command line entry point +set CFG_DEV_MODE=0 +set "CFG_REQUIREMENTS=%REQUIREMENTS%" + +if "%1" EQU "--help" (goto cli_help) +if "%1" EQU "--clean" (goto clean) +if "%1" EQU "--dev" ( + set "CFG_REQUIREMENTS=%DEV_REQUIREMENTS%" + set CFG_DEV_MODE=1 ) -if exist ""%PROVIDED_PYTHON%"" ( - set PYTHON_EXECUTABLE=%PROVIDED_PYTHON% - goto run +if "%1" EQU "--python" ( + 
echo "The --python is now DEPRECATED. Use the PYTHON_EXECUTABLE environment + echo "variable instead. Run configure --help for details." + exit /b 0 ) +@rem ################################ +@rem # find a proper Python to run +@rem # Use environment variables or a file if available. +@rem # Otherwise the latest Python by default. +if not defined PYTHON_EXECUTABLE ( + @rem # check for a file named PYTHON_EXECUTABLE + if exist ""%CFG_ROOT_DIR%\PYTHON_EXECUTABLE"" ( + set /p PYTHON_EXECUTABLE=<""%CFG_ROOT_DIR%\PYTHON_EXECUTABLE"" + ) else ( + set "PYTHON_EXECUTABLE=py" + ) +) -@rem otherwise we search for a suitable Python interpreter -:find_python -@rem First check the existence of the "py" launcher (available in Python 3) -@rem if we have it, check if we have a py -3 installed with the good version or a py 2.7 -@rem if not, check if we have an old py 2.7 -@rem exist if all fails +:create_virtualenv +@rem # create a virtualenv for Python +@rem # Note: we do not use the bundled Python 3 "venv" because its behavior and +@rem # presence is not consistent across Linux distro and sometimes pip is not +@rem # included either by default. The virtualenv.pyz app cures all these issues. -where py >nul 2>nul -if %ERRORLEVEL% == 0 ( - @rem we have a py launcher, check for the availability of our required Python 3 version - py -3.6 --version >nul 2>nul - if %ERRORLEVEL% == 0 ( - set PYTHON_EXECUTABLE=py -3.6 - ) else ( - @rem we have no required python 3, let's try python 2: - py -2 --version >nul 2>nul - if %ERRORLEVEL% == 0 ( - set PYTHON_EXECUTABLE=py -2 - ) else ( - @rem we have py and no python 3 and 2, exit - echo * Unable to find an installation of Python. - exit /b 1 - ) +if not exist ""%CFG_BIN_DIR%\python.exe"" ( + if not exist "%CFG_BIN_DIR%" ( + mkdir %CFG_BIN_DIR% ) -) else ( - @rem we have no py launcher, check for a default Python 2 installation - if not exist ""%DEFAULT_PYTHON2%"" ( - echo * Unable to find an installation of Python. 
- exit /b 1 + + if exist ""%CFG_ROOT_DIR%\etc\thirdparty\virtualenv.pyz"" ( + %PYTHON_EXECUTABLE% "%CFG_ROOT_DIR%\etc\thirdparty\virtualenv.pyz" ^ + --wheel embed --pip embed --setuptools embed ^ + --seeder pip ^ + --never-download ^ + --no-periodic-update ^ + --no-vcs-ignore ^ + %CFG_QUIET% ^ + %CFG_ROOT_DIR%\%VIRTUALENV_DIR% ) else ( - set PYTHON_EXECUTABLE=%DEFAULT_PYTHON2% + if not exist ""%CFG_ROOT_DIR%\%VIRTUALENV_DIR%\virtualenv.pyz"" ( + curl -o "%CFG_ROOT_DIR%\%VIRTUALENV_DIR%\virtualenv.pyz" %VIRTUALENV_PYZ_URL% + + if %ERRORLEVEL% neq 0 ( + exit /b %ERRORLEVEL% + ) + ) + %PYTHON_EXECUTABLE% "%CFG_ROOT_DIR%\%VIRTUALENV_DIR%\virtualenv.pyz" ^ + --wheel embed --pip embed --setuptools embed ^ + --seeder pip ^ + --never-download ^ + --no-periodic-update ^ + --no-vcs-ignore ^ + %CFG_QUIET% ^ + %CFG_ROOT_DIR%\%VIRTUALENV_DIR% ) ) +if %ERRORLEVEL% neq 0 ( + exit /b %ERRORLEVEL% +) + -:run -@rem without this things may not always work on Windows 10, but this makes things slower -set PYTHONDONTWRITEBYTECODE=1 +:install_packages +@rem # install requirements in virtualenv +@rem # note: --no-build-isolation means that pip/wheel/setuptools will not +@rem # be reinstalled a second time and reused from the virtualenv and this +@rem # speeds up the installation. +@rem # We always have the PEP517 build dependencies installed already. 
-call mkdir "%CFG_ROOT_DIR%tmp" -call curl -o "%CFG_ROOT_DIR%tmp\virtualenv.pyz" https://bootstrap.pypa.io/virtualenv.pyz -call %PYTHON_EXECUTABLE% "%CFG_ROOT_DIR%tmp\virtualenv.pyz" --wheel embed --pip embed --setuptools embed --seeder pip "%CFG_ROOT_DIR%tmp" -call "%CFG_ROOT_DIR%tmp\Scripts\activate" -call "%CFG_ROOT_DIR%tmp\Scripts\pip" install -e .[testing] +%CFG_BIN_DIR%\pip install ^ + --upgrade ^ + --no-build-isolation ^ + %CFG_QUIET% ^ + %PIP_EXTRA_ARGS% ^ + %CFG_REQUIREMENTS% -@rem Return a proper return code on failure if %ERRORLEVEL% neq 0 ( exit /b %ERRORLEVEL% ) -endlocal -goto activate +exit /b 0 + + +@rem ################################ :cli_help -echo A configuration script for Windows -echo usage: configure [options] [path/to/config/directory] -echo. -echo The options and arguments are: -echo --clean : this is exclusive of anything else and cleans the environment -echo from built and installed files -echo. -echo --python path/to/python.exe : this is set to the path of an alternative -echo Python executable to use. If path/to/python.exe is set to "path", -echo then the executable will be the python.exe available in the PATH. -echo. - - -:activate -@rem Activate the virtualenv -if exist "%CFG_ROOT_DIR%tmp\Scripts\activate" ( - "%CFG_ROOT_DIR%tmp\Scripts\activate" + echo An initial configuration script + echo " usage: configure [options]" + echo " " + echo The default is to configure for regular use. Use --dev for development. + echo " " + echo The options are: + echo " --clean: clean built and installed files and exit." + echo " --dev: configure the environment for development." + echo " --help: display this help message and exit." + echo " " + echo By default, the python interpreter version found in the path is used. + echo Alternatively, the PYTHON_EXECUTABLE environment variable can be set to + echo configure another Python executable interpreter to use. 
If this is not + echo set, a file named PYTHON_EXECUTABLE containing a single line with the + echo path of the Python executable to use will be checked last. + exit /b 0 + + +:clean +@rem # Remove cleanable file and directories and files from the root dir. +echo "* Cleaning ..." +for %%F in (%CLEANABLE%) do ( + rmdir /s /q "%CFG_ROOT_DIR%\%%F" >nul 2>&1 + del /f /q "%CFG_ROOT_DIR%\%%F" >nul 2>&1 ) +exit /b 0 diff --git a/etc/ci/azure-posix.yml b/etc/ci/azure-posix.yml index 752ae2e..0921d9b 100644 --- a/etc/ci/azure-posix.yml +++ b/etc/ci/azure-posix.yml @@ -19,6 +19,7 @@ jobs: python_version: ${{ pyver }} test_suite_label: ${{ tsuite.key }} test_suite: ${{ tsuite.value }} + steps: - checkout: self fetchDepth: 10 @@ -29,7 +30,11 @@ jobs: architecture: '${{ parameters.python_architecture }}' displayName: 'Install Python $(python_version)' - - script: ./configure + - script: | + python3 --version + python$(python_version) --version + echo "python$(python_version)" > PYTHON_EXECUTABLE + ./configure --dev displayName: 'Run Configure' - script: $(test_suite) diff --git a/etc/ci/azure-win.yml b/etc/ci/azure-win.yml index afe1686..03d8927 100644 --- a/etc/ci/azure-win.yml +++ b/etc/ci/azure-win.yml @@ -29,7 +29,10 @@ jobs: architecture: '${{ parameters.python_architecture }}' displayName: 'Install Python $(python_version)' - - script: configure --python path + - script: | + python --version + echo | set /p=python> PYTHON_EXECUTABLE + configure --dev displayName: 'Run Configure' - script: $(test_suite) From aa04429ae6e5d05ef8ee2a0fbad9872014463a25 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 11 May 2021 11:17:09 +0200 Subject: [PATCH 18/42] Add notes on customization Signed-off-by: Philippe Ombredanne --- README.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.rst b/README.rst index a0e682f..a291173 100644 --- a/README.rst +++ b/README.rst @@ -32,3 +32,12 @@ Update an existing project git merge skeleton/main --allow-unrelated-histories 
This is also the workflow to use when updating the skeleton files in any given repository. + + +Customizing +----------- + +You typically want to perform these customizations: + +- remove or update the src/README.rst and tests/README.rst files +- check the configure and configure.bat defaults From 56ada8fffacac14140bf016fd3f6bee4f4615fcc Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 11 May 2021 11:19:12 +0200 Subject: [PATCH 19/42] Adopt new configure --dev convention Signed-off-by: Philippe Ombredanne --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 7a342df..1b52eb2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,7 +15,7 @@ python: - "3.8" # Scripts to run at install stage -install: ./configure +install: ./configure --dev # Scripts to run at script stage script: tmp/bin/pytest From 0dbcdc9f6c929b3d030910a69e5566c149e15d7a Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 11 May 2021 11:21:48 +0200 Subject: [PATCH 20/42] Clarify CHANGELOG to be Rst Signed-off-by: Philippe Ombredanne --- CHANGELOG.rst | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 5f8bc8d..fc2b6e3 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,5 +1,8 @@ -Release notes -------------- -### Version 0.0.0 +Changelog +========= + + +v0.0.0 +------ *xxxx-xx-xx* -- Initial release. 
From d21aef35a61675289bbebf963030b539c10a7b28 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 11 May 2021 11:22:22 +0200 Subject: [PATCH 21/42] Add skeleton release notes to README.rst This way they do not end up in the template CHANGELOG.rst Signed-off-by: Philippe Ombredanne --- README.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.rst b/README.rst index a291173..b84a049 100644 --- a/README.rst +++ b/README.rst @@ -41,3 +41,10 @@ You typically want to perform these customizations: - remove or update the src/README.rst and tests/README.rst files - check the configure and configure.bat defaults + + +Release Notes +------------- + +- 2021-05-11: adopt new configure scripts from ScanCode TK that allows correct + configuration of which Python version is used. From ab707b5b94ed50ac050b6e93b8e60b5ad00d450e Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Wed, 12 May 2021 00:12:32 +0200 Subject: [PATCH 22/42] Improve README Signed-off-by: Philippe Ombredanne --- README.rst | 72 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index 89354cb..b3cc7bc 100644 --- a/README.rst +++ b/README.rst @@ -50,39 +50,75 @@ To run the command line tool in the activated environment:: ./extractcode -h -Adding support for VM images ----------------------------- +Adding support for VM images extraction +--------------------------------------- -Adding support for VM images requires the manual installation of libguestfs and -it Python binding. You will need to install the libguestfs tools system package. -On Debian and Ubuntu:: +Adding support for VM images requires the manual installation of libguestfs +tools system package. This is supported on Linux only. On Debian and Ubuntu you can +use this:: sudo apt-get install libguestfs-tools -On Ubuntu, a manual stpe is required if the kernel executable file cannot be read. 
-This is required by guestfish and libguestfs and this is an oddity there and not on Debian. +On Ubuntu only, an additional manual step is required as the kernel executable +file cannot be read as required by libguestfs. -Run this command as a temporary fix:: +Run this command as a temporary and immediate fix:: for k in /boot/vmlinuz-* - do sudo dpkg-statoverride --add --update root root 0644 /boot/vmlinuz-$(uname -r) + do sudo dpkg-statoverride --add --update root root 0644 "$k" done -or:: - sudo chmod +r /boot/vmlinuz-*, +But you likely want both this temporary fix and a permanent fix; otherwise each +kernel update will revert to the default permissions and extractcode will stop +working for VM images extraction. +Therefore follow these instructions: -For a permanent fix see: +1. As sudo, create the file /etc/kernel/postinst.d/statoverride with this +content, devised by Kees Cook (@kees) in +https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725/comments/3 :: - - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662/comments/21 + #!/bin/sh + version="$1" + # passing the kernel version is required + [ -z "${version}" ] && exit 0 + dpkg-statoverride --update --add root root 0644 /boot/vmlinuz-${version} -See also for a discussion: - - - https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725 - - https://bugzilla.redhat.com/show_bug.cgi?id=1670790 - - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662 +2. 
Set executable permissions:: + sudo chmod +x /etc/kernel/postinst.d/statoverride +See also for a complete discussion: + - https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725 + - https://bugzilla.redhat.com/show_bug.cgi?id=1670790 + - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662/comments/24 + + +Configuration with environment variables +---------------------------------------- + +ExtractCode will use these environment variables if set: + +- EXTRACTCODE_GUESTFISH_PATH : the path to the ``guestfish`` tool from + libguestfs to use to extract VM images. If not provided, ExtractCode will look + in the PATH for an installed ``guestfish`` executable instead. + +- EXTRACTCODE_LIBARCHIVE_PATH : the path to the ``libarchive.so`` libarchive + shared library used to support some of the archive formats. If not provided, + ExtractCode will look for a plugin-provided libarchive library path. See + https://github.com/nexB/scancode-plugins/tree/main/builtins for such plugins. + + If no plugin contributes libarchive, then a final attempt is made to look for + it in the PATH using standard DLL loading techniques. + +- EXTRACTCODE_7Z_PATH : the path to the ``7z`` 7zip executable used to support + some of the archive formats. If not provided, ExtractCode will look for a + plugin-provided 7z executable path. See + https://github.com/nexB/scancode-plugins/tree/main/builtins for such plugins. + + If no plugin contributes 7z, then a final attempt is made to look for + it in the PATH. + \ No newline at end of file From 8163e2aa08f198c804915470f0aaa4021f4e4394 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Wed, 12 May 2021 00:14:28 +0200 Subject: [PATCH 23/42] Implement new envt. variables approach We now load native libraries and executables from: 1. an envt. variable path 2. OR a locatin provider plugin 3. 
OR the PATH or we fail Signed-off-by: Philippe Ombredanne --- src/extractcode/libarchive2.py | 32 +++++++++++++++++------ src/extractcode/sevenzip.py | 46 +++++++++++++++++++++++----------- src/extractcode/vmimage.py | 42 +++++++++++-------------------- 3 files changed, 71 insertions(+), 49 deletions(-) diff --git a/src/extractcode/libarchive2.py b/src/extractcode/libarchive2.py index 70ab4f5..26f7a67 100644 --- a/src/extractcode/libarchive2.py +++ b/src/extractcode/libarchive2.py @@ -37,6 +37,7 @@ from commoncode import fileutils from commoncode import paths from commoncode import text +from commoncode.system import on_windows import extractcode from extractcode import ExtractError @@ -83,26 +84,41 @@ """ # keys for plugin-provided locations -EXTRACTCODE_LIBARCHIVE_LIBDIR = 'extractcode.libarchive.libdir' EXTRACTCODE_LIBARCHIVE_DLL = 'extractcode.libarchive.dll' +EXTRACTCODE_LIBARCHIVE_PATH_ENVVAR = 'EXTRACTCODE_LIBARCHIVE_PATH' + def load_lib(): """ - Return the loaded libarchive shared library object from plugin-provided path. + Return the libarchive shared library object loaded from either: + - an environment variable ``EXTRACTCODE_LIBARCHIVE_PATH`` + - a plugin-provided path, + - the system PATH. + Raise an Exception if no libarchive can be found. """ from plugincode.location_provider import get_location - dll = get_location(EXTRACTCODE_LIBARCHIVE_DLL) - libdir = get_location(EXTRACTCODE_LIBARCHIVE_LIBDIR) - if not (dll and libdir) or not os.path.isfile(dll) or not os.path.isdir(libdir): + # try the environment first + dll_loc = os.environ.get(EXTRACTCODE_LIBARCHIVE_PATH_ENVVAR) + + # try a plugin-provided path second + if not dll_loc: + dll_loc = get_location(EXTRACTCODE_LIBARCHIVE_DLL) + + # try the PATH + if not dll_loc: + dll = 'libarchive.dll' if on_windows else 'libarchive.so' + dll_loc = command.find_in_path(dll) + + if not dll_loc or not os.path.isfile(dll_loc): raise Exception( 'CRITICAL: libarchive DLL is not installed. 
' 'Unable to continue: you need to install a valid extractcode-libarchive ' - 'plugin with a valid libarchive DLL available.' + 'plugin with a valid libarchive DLL available. ' + f'OR set the {EXTRACTCODE_LIBARCHIVE_PATH_ENVVAR} environment variable.' ) - return command.load_shared_library(dll, libdir) - + return command.load_shared_library(dll_loc) def set_env_with_tz(): diff --git a/src/extractcode/sevenzip.py b/src/extractcode/sevenzip.py index 777b5ed..797eaec 100644 --- a/src/extractcode/sevenzip.py +++ b/src/extractcode/sevenzip.py @@ -56,10 +56,11 @@ logging.basicConfig(stream=sys.stdout) logger.setLevel(logging.DEBUG) -# keys for plugin-provided locations -EXTRACTCODE_7ZIP_LIBDIR = 'extractcode.sevenzip.libdir' +# key of a plugin-provided location EXTRACTCODE_7ZIP_EXE = 'extractcode.sevenzip.exe' +EXTRACTCODE_7ZIP_PATH_ENVVAR = 'EXTRACTCODE_7Z_PATH' + sevenzip_errors = [ ('unsupported method', 'Unsupported archive or broken archive'), ('wrong password', 'Password protected archive, unable to extract'), @@ -71,22 +72,40 @@ UNKNOWN_ERROR = 'Unknown extraction error' -def get_bin_locations(): +def get_command_location(_cache=[]): """ - Return a tuple of (lib_dir, cmd_loc) for 7zip loaded from plugin-provided path. + Return the location of a 7zip loaded from either: + - an environment variable ``EXTRACTCODE_7Z_PATH``, + - a plugin-provided path, + - the system PATH. + Raise an Exception if no 7Zip command can be found. 
""" + if _cache: + return _cache[0] + from plugincode.location_provider import get_location - cmd_loc = get_location(EXTRACTCODE_7ZIP_EXE) - libdir = get_location(EXTRACTCODE_7ZIP_LIBDIR) - if not (cmd_loc and libdir) or not os.path.isfile(cmd_loc) or not os.path.isdir(libdir): + # try the environment first + cmd_loc = os.environ.get(EXTRACTCODE_7ZIP_PATH_ENVVAR) + + # try a plugin-provided path second + if not cmd_loc: + cmd_loc = get_location(EXTRACTCODE_7ZIP_EXE) + + # try the PATH + if not cmd_loc: + cmd = '7z.exe' if on_windows else '7z' + cmd_loc = command.find_in_path(cmd) + + if not cmd_loc or not os.path.isfile(cmd_loc): raise Exception( 'CRITICAL: 7zip executable is not installed. ' 'Unable to continue: you need to install a valid extractcode-7z ' - 'plugin with a valid executable available.' + 'plugin with a valid executable available. ' + 'OR set the EXTRACTCODE_7ZIP_PATH environment variable.' ) - - return libdir, cmd_loc + _cache.append(cmd_loc) + return cmd_loc def get_7z_errors(stdout, stderr): @@ -315,12 +334,11 @@ def build_7z_extract_command(location, target_dir, single_entry=None, arch_type= if single_entry: args += [shlex_quote(single_entry.path)] - lib_dir, cmd_loc = get_bin_locations() + cmd_loc = get_command_location() ex_args = dict( cmd_loc=cmd_loc, args=args, - lib_dir=lib_dir, cwd=target_dir, env=timezone, ) @@ -387,7 +405,7 @@ def extract_file_by_file(location, target_dir, arch_type='*', skip_symlinks=True single_entry=entry, arch_type=arch_type, ) - rc, stdout, stderr = command.execute2(**ex_args) + rc, stdout, stderr = command.execute(**ex_args) error = get_7z_errors(stdout, stderr) if error or rc != 0: @@ -488,7 +506,7 @@ def list_entries(location, arch_type='*'): abs_location, ] - lib_dir, cmd_loc = get_bin_locations() + cmd_loc = get_command_location() rc, stdout, stderr = command.execute2( cmd_loc=cmd_loc, diff --git a/src/extractcode/vmimage.py b/src/extractcode/vmimage.py index 5ee381d..c9e5ae0 100644 --- 
a/src/extractcode/vmimage.py +++ b/src/extractcode/vmimage.py @@ -38,12 +38,6 @@ Works only if libguestfs tool guestfish is in the path. See https://libguestfs.org/ - -On Ubuntu, you may face this issue when running guestfish: - -- https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725 -- https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662 -- https://unix.stackexchange.com/a/642914/185837 """ logger = logging.getLogger(__name__) @@ -59,9 +53,16 @@ 'WARNING: guestfish executable is not installed. ' 'Unable to extract virtual machine image: you need to install the ' 'guestfish tool from libguestfs and extra FS drivers if needed. ' - 'See https://libguestfs.org/ for details.' + 'See the ExtractCode README.rst and https://libguestfs.org/ for details.' ) +GUESTFISH_KERNEL_NOT_READABLE = ( +'''libguestfs requires the kernel executable to be readable. +This is the case by default on most Linux distributions except on Ubuntu. +Please follow the instructions in ExtractCode installation guide to make this happen. +See the ExtractCode README.rst for details. +''') + EXTRACTCODE_GUESTFISH_PATH_ENVVAR = 'EXTRACTCODE_GUESTFISH_PATH' @@ -89,26 +90,15 @@ def check_linux_kernel_is_readable(): - https://bugzilla.redhat.com/show_bug.cgi?id=1670790 - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662 """ - error = ( - 'libguestfs requires the kernel executable to be readable. 
' - 'This is the case on most Linux distribution except on Ubuntu.\n' - 'Run this command as a temporary fix:\n' - ' for k in /boot/vmlinuz-*\n' - ' do sudo dpkg-statoverride --add --update root root 0644 /boot/vmlinuz-$(uname -r)\n' - ' done\n' - 'or:\n' - ' sudo chmod +r /boot/vmlinuz-*\n\n', - 'For a permanent fix see: ' - 'https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662/comments/21' - ) + if on_linux: kernels = list(pathlib.Path('/boot').glob('vmlinuz-*')) if not kernels: - raise ExtractErrorFailedToExtract(error) + raise ExtractErrorFailedToExtract(GUESTFISH_KERNEL_NOT_READABLE) for kern in kernels: if not os.access(kern, os.R_OK): raise ExtractErrorFailedToExtract( - f'Unable to read kernel at: {kern}.\n{error}') + f'Unable to read kernel at: {kern}.\n{GUESTFISH_KERNEL_NOT_READABLE}') @attr.s @@ -250,7 +240,7 @@ def run_guestfish(self, args, timeout=None): return as_unicode(stdout) -def extract(location, target_dir, as_tarballs=True): +def extract(location, target_dir, as_tarballs=False): """ Extract all files from a guestfish-supported VM image archive file at location in the target_dir directory. 
Optionally only extract the @@ -306,8 +296,7 @@ def extract(location, target_dir, as_tarballs=True): # we can safely extract this to a root / dir as we have only one partition partition, _parttype = partitions[0] if not as_tarballs: - intermediate_dir = fileutils.get_temp_dir(prefix='extractcode-vmimage') - tdir = intermediate_dir + tdir = fileutils.get_temp_dir(prefix='extractcode-vmimage') else: tdir = target_dir @@ -329,8 +318,7 @@ def extract(location, target_dir, as_tarballs=True): base_name = partition.replace('/', '-') if not as_tarballs: - intermediate_dir = fileutils.get_temp_dir(prefix='extractcode-vmimage') - tdir = intermediate_dir + tdir = fileutils.get_temp_dir(prefix='extractcode-vmimage') else: tdir = target_dir @@ -344,7 +332,7 @@ def extract(location, target_dir, as_tarballs=True): fileutils.create_dir(partition_target_dir) warns = extract_image_tarball( tarball=target_tarball, - target_dir=target_dir, + target_dir=partition_target_dir, skip_symlinks=False) warnings.extend(warns) From acb85c001942e27c15ba1d0c57145e85d55202b8 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Fri, 28 May 2021 18:27:47 +0200 Subject: [PATCH 24/42] Streamline default kinds code Reuse variables. 
Signed-off-by: Philippe Ombredanne --- src/extractcode/__init__.py | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/extractcode/__init__.py b/src/extractcode/__init__.py index fb7ee0f..7b97c83 100644 --- a/src/extractcode/__init__.py +++ b/src/extractcode/__init__.py @@ -42,8 +42,6 @@ logging.basicConfig(level=logging.DEBUG, stream=sys.stdout) logger.setLevel(logging.DEBUG) -root_dir = join(dirname(__file__), 'bin') - # Suffix added to extracted target_dir paths EXTRACT_SUFFIX = '-extract' @@ -66,13 +64,26 @@ 7: 'special_package', } -# note: do not include special_package in all by default -all_kinds = (regular, regular_nested, package, file_system, docs, patches, special_package) -default_kinds = (regular, regular_nested, package) +# note: we do not include special_package in all_kinds by default +all_kinds = ( + regular, + regular_nested, + package, + file_system, + docs, + patches, + special_package, +) + +default_kinds = ( + regular, + regular_nested, + package, +) # map user-visible extract types to tuples of "kinds" extract_types = { - 'default': (regular, regular_nested, package,), + 'default': default_kinds, 'all': all_kinds, 'package': (package,), 'filesystem': (file_system,), From 15d43d5dd9aba5900b00120da3452624f3721ece Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Fri, 28 May 2021 18:28:10 +0200 Subject: [PATCH 25/42] Format code Signed-off-by: Philippe Ombredanne --- src/extractcode/archive.py | 21 +++++++++++++++++---- src/extractcode/libarchive2.py | 3 ++- src/extractcode/sevenzip.py | 3 ++- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py index 6841fc9..fcc73a5 100644 --- a/src/extractcode/archive.py +++ b/src/extractcode/archive.py @@ -112,12 +112,16 @@ def should_extract(location, kinds, ignore_pattern=()): Return True if this location should be extracted based on the provided kinds """ location = 
os.path.abspath(os.path.expanduser(location)) - ignore_pattern = {extension : 'User ignore: Supplied by --ignore' for extension in ignore_pattern} + ignore_pattern = {extension : 'User ignore: Supplied by --ignore' + for extension in ignore_pattern} should_ignore = is_ignored(location, ignore_pattern) extractor = get_extractor(location, kinds=kinds) if TRACE_DEEP: - logger.debug(f' should_extract: extractor: {extractor}, should_ignore: {should_ignore}') + logger.debug( + f' should_extract: extractor: {extractor}, ' + f'should_ignore: {should_ignore}' + ) if extractor and not should_ignore: return True @@ -125,8 +129,17 @@ def should_extract(location, kinds, ignore_pattern=()): def get_extractor(location, kinds=all_kinds): """ - Return an extraction callable that can extract the file at location or - an None if no extract function is found. + Return an extraction callable that can extract the file at ``location`` or + None if no extraction callable function is found. + Limit the search for an extractor to the ``kinds`` list of archive kinds. + See extractcode.all_kinds for details. + + An extraction callable should accept these arguments: + - location of the file to extract + - target_dir where to extract + It should extract files from the `location` in the `target_dir` directory. + It must return a list of warning messages if any or an empty list. + It must raise Exceptions on errors. 
""" assert location location = os.path.abspath(os.path.expanduser(location)) diff --git a/src/extractcode/libarchive2.py b/src/extractcode/libarchive2.py index 26f7a67..3592efb 100644 --- a/src/extractcode/libarchive2.py +++ b/src/extractcode/libarchive2.py @@ -185,7 +185,8 @@ def extract(location, target_dir, skip_symlinks=True): logger.debug('skipping: {}'.format(entry)) if entry.issym and not skip_symlinks: - raise NotImplemented('extraction of symlinks with libarchive is not yet implemented.') + raise NotImplemented( + 'extraction of symlinks with libarchive is not yet implemented.') continue if TRACE: diff --git a/src/extractcode/sevenzip.py b/src/extractcode/sevenzip.py index 797eaec..afae3e4 100644 --- a/src/extractcode/sevenzip.py +++ b/src/extractcode/sevenzip.py @@ -230,7 +230,8 @@ def extract(location, target_dir, arch_type='*', file_by_file=on_mac, skip_symli location=abs_location, target_dir=abs_target_dir, arch_type=arch_type, - skip_symlinks=skip_symlinks) + skip_symlinks=skip_symlinks, + ) def extract_all_files_at_once(location, target_dir, arch_type='*', skip_symlinks=True): From a63c8491ba7e4627ffb61abfa0f151d8fea0059b Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Fri, 28 May 2021 18:30:47 +0200 Subject: [PATCH 26/42] Work towards symlinks support We are not extracting symlinks, though it could be useful in the future for some cases. 
Signed-off-by: Philippe Ombredanne --- src/extractcode/vmimage.py | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/src/extractcode/vmimage.py b/src/extractcode/vmimage.py index c9e5ae0..c588391 100644 --- a/src/extractcode/vmimage.py +++ b/src/extractcode/vmimage.py @@ -190,7 +190,7 @@ def listfs(self, skip_partitions=('swap',)): def extract_image(self, target_tarball): """ - Extract all files from this VM image in the `target_tarball` file as a + Extract all files from this VM image in the `target_tarball` file as a gzipped-compressed tarball (.tar.gz). Raise exception on errors. """ args = [ @@ -230,25 +230,36 @@ def run_guestfish(self, args, timeout=None): import subprocess full_args = [self.guestfish_command] + args try: - stdout = subprocess.check_output(full_args, timeout=timeout, stderr=subprocess.STDOUT) + stdout = subprocess.check_output( + full_args, + timeout=timeout, + stderr=subprocess.STDOUT, + ) except subprocess.CalledProcessError as cpe: args = ' '.join([self.guestfish_command] + args) output = as_unicode(cpe.output) - error = f'Failed to run guestfish to extract VM image: {args}\noutput: {output}' - raise ExtractErrorFailedToExtract(error) # from cpe + error = ( + f'Failed to run guestfish to extract VM image: {args}\n' + f'output: {output}' + ) + raise ExtractErrorFailedToExtract(error) return as_unicode(stdout) -def extract(location, target_dir, as_tarballs=False): +def extract(location, target_dir, as_tarballs=False, skip_symlinks=True): """ Extract all files from a guestfish-supported VM image archive file at - location in the target_dir directory. Optionally only extract the - intermediate tarballs if `as_tarball` is True. Otherwise, extract to - intermediate tarballs and then extract each tarballs to the final directory. - + location in the target_dir directory. Return a list of warning messages if any or an empty list. 
+ + Optionally only extract the intermediate tarballs if `as_tarball` is True. + Otherwise, extract to intermediate tarballs and then extract each tarballs + to the final directory. + + Optionally skip extracting symlinks. Raise exception on errors. + This works only on Linux. """ assert target_dir @@ -280,7 +291,8 @@ def extract(location, target_dir, as_tarballs=False): warns = extract_image_tarball( tarball=target_tarball, target_dir=target_dir, - skip_symlinks=False) + skip_symlinks=skip_symlinks, + ) warnings.extend(warns) except ExtractErrorFailedToExtract as e: @@ -308,7 +320,8 @@ def extract(location, target_dir, as_tarballs=False): warns = extract_image_tarball( tarball=target_tarball, target_dir=target_dir, - skip_symlinks=False) + skip_symlinks=skip_symlinks, + ) warnings.extend(warns) else: # with multiple partitions, we extract each partition to a unique @@ -333,7 +346,8 @@ def extract(location, target_dir, as_tarballs=False): warns = extract_image_tarball( tarball=target_tarball, target_dir=partition_target_dir, - skip_symlinks=False) + skip_symlinks=skip_symlinks, + ) warnings.extend(warns) return warnings From a0732862846d5423030f0e9671d6e3eab93ddb43 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Sun, 30 May 2021 22:13:15 +0200 Subject: [PATCH 27/42] Format code and streamline license headers Signed-off-by: Philippe Ombredanne --- NOTICE | 21 +++++------------- src/extractcode/NOTICE | 21 +++++------------- src/extractcode/__init__.py | 37 +++++++++++-------------------- src/extractcode/api.py | 25 ++++++--------------- src/extractcode/archive.py | 21 +++++------------- src/extractcode/cli.py | 22 +++++------------- src/extractcode/extract.py | 21 +++++------------- src/extractcode/libarchive2.py | 21 +++++------------- src/extractcode/patch.py | 21 +++++------------- src/extractcode/sevenzip.py | 21 +++++------------- src/extractcode/uncompress.py | 21 +++++------------- src/extractcode/vmimage.py | 21 +++++------------- 
tests/extractcode_assert_utils.py | 22 +++++------------- tests/test_archive.py | 26 +++++++--------------- tests/test_extract.py | 21 +++++------------- tests/test_extractcode.py | 22 +++++------------- tests/test_extractcode_cli.py | 22 +++++------------- tests/test_libarchive2.py | 23 +++++-------------- tests/test_patch.py | 22 +++++------------- tests/test_sevenzip.py | 21 +++++------------- tests/test_vmimage.py | 22 +++++------------- 21 files changed, 119 insertions(+), 355 deletions(-) diff --git a/NOTICE b/NOTICE index 65936b2..ad3576b 100644 --- a/NOTICE +++ b/NOTICE @@ -1,19 +1,8 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # diff --git a/src/extractcode/NOTICE b/src/extractcode/NOTICE index 65936b2..ad3576b 100644 --- a/src/extractcode/NOTICE +++ b/src/extractcode/NOTICE @@ -1,19 +1,8 @@ # -# Copyright (c) nexB Inc. and others. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # diff --git a/src/extractcode/__init__.py b/src/extractcode/__init__.py index 7b97c83..93d910d 100644 --- a/src/extractcode/__init__.py +++ b/src/extractcode/__init__.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import logging @@ -66,18 +55,18 @@ # note: we do not include special_package in all_kinds by default all_kinds = ( - regular, - regular_nested, - package, - file_system, - docs, - patches, + regular, + regular_nested, + package, + file_system, + docs, + patches, special_package, ) default_kinds = ( - regular, - regular_nested, + regular, + regular_nested, package, ) diff --git a/src/extractcode/api.py b/src/extractcode/api.py index dc4f121..d250f86 100644 --- a/src/extractcode/api.py +++ b/src/extractcode/api.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
# """ @@ -42,13 +31,13 @@ def extract_archives( Note: this API is returning an iterable and NOT a sequence. """ - + from extractcode.extract import extract from extractcode import default_kinds from extractcode import all_kinds kinds = all_kinds if all_formats else default_kinds - + for xevent in extract( location=location, kinds=kinds, diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py index fcc73a5..8b6f284 100644 --- a/src/extractcode/archive.py +++ b/src/extractcode/archive.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # from collections import namedtuple diff --git a/src/extractcode/cli.py b/src/extractcode/cli.py index 0f31ee1..df353ed 100644 --- a/src/extractcode/cli.py +++ b/src/extractcode/cli.py @@ -1,22 +1,10 @@ - -# -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 # -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. 
and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import os diff --git a/src/extractcode/extract.py b/src/extractcode/extract.py index bb93203..774847d 100644 --- a/src/extractcode/extract.py +++ b/src/extractcode/extract.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import logging diff --git a/src/extractcode/libarchive2.py b/src/extractcode/libarchive2.py index 3592efb..ce330a5 100644 --- a/src/extractcode/libarchive2.py +++ b/src/extractcode/libarchive2.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # from functools import partial diff --git a/src/extractcode/patch.py b/src/extractcode/patch.py index 8a197ca..47fbf28 100644 --- a/src/extractcode/patch.py +++ b/src/extractcode/patch.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. 
# ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import posixpath diff --git a/src/extractcode/sevenzip.py b/src/extractcode/sevenzip.py index afae3e4..c044e49 100644 --- a/src/extractcode/sevenzip.py +++ b/src/extractcode/sevenzip.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. 
+# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # from collections import defaultdict diff --git a/src/extractcode/uncompress.py b/src/extractcode/uncompress.py index f584e84..62da56c 100644 --- a/src/extractcode/uncompress.py +++ b/src/extractcode/uncompress.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import bz2 diff --git a/src/extractcode/vmimage.py b/src/extractcode/vmimage.py index c588391..3c39828 100644 --- a/src/extractcode/vmimage.py +++ b/src/extractcode/vmimage.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. 
-# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import logging diff --git a/tests/extractcode_assert_utils.py b/tests/extractcode_assert_utils.py index 537cf1d..756ecc3 100644 --- a/tests/extractcode_assert_utils.py +++ b/tests/extractcode_assert_utils.py @@ -1,22 +1,10 @@ - -# -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 # -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. 
+# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import json diff --git a/tests/test_archive.py b/tests/test_archive.py index 7dee15f..3a731ae 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -1,22 +1,11 @@ # -*- coding: utf-8 -*- # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. 
# import os @@ -60,6 +49,7 @@ project_root = os.path.dirname(os.path.dirname(__file__)) + class TestGetExtractorTest(BaseArchiveTestCase): def test_get_extractors_1(self): @@ -225,7 +215,7 @@ def test_get_extractor_qcow2(self): expected = [archive.extract_vm_image] self.check_get_extractors(test_file, expected, kinds=()) - self.check_get_extractors(test_file, expected, kinds=(extractcode.file_system, )) + self.check_get_extractors(test_file, expected, kinds=(extractcode.file_system,)) self.check_get_extractors(test_file, expected, kinds=extractcode.all_kinds) def test_get_extractor_for_dia(self): @@ -307,7 +297,7 @@ def test_7zip_extract_can_extract_to_relative_paths(self): from extractcode.sevenzip import extract test_file = self.get_test_loc('archive/relative_path/basic.zip', copy=True) - + project_tmp = join(project_root, 'tmp') fileutils.create_dir(project_tmp) project_root_abs = abspath(project_root) diff --git a/tests/test_extract.py b/tests/test_extract.py index 58842cc..a0a3475 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -1,22 +1,11 @@ # -*- coding: utf-8 -*- # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
+# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import io diff --git a/tests/test_extractcode.py b/tests/test_extractcode.py index 0a7dafb..cf06d44 100644 --- a/tests/test_extractcode.py +++ b/tests/test_extractcode.py @@ -1,22 +1,10 @@ - -# -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 # -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # from os.path import dirname diff --git a/tests/test_extractcode_cli.py b/tests/test_extractcode_cli.py index a793959..c7cd6e4 100644 --- a/tests/test_extractcode_cli.py +++ b/tests/test_extractcode_cli.py @@ -1,22 +1,10 @@ - -# -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 # -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. 
# ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import os diff --git a/tests/test_libarchive2.py b/tests/test_libarchive2.py index 70ce34b..6bd7728 100644 --- a/tests/test_libarchive2.py +++ b/tests/test_libarchive2.py @@ -1,23 +1,12 @@ - -# -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 # -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# + import os from commoncode import fileutils diff --git a/tests/test_patch.py b/tests/test_patch.py index 6ecb3cc..5a70951 100644 --- a/tests/test_patch.py +++ b/tests/test_patch.py @@ -1,22 +1,10 @@ - -# -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 # -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import io diff --git a/tests/test_sevenzip.py b/tests/test_sevenzip.py index ee32439..d9ad11a 100644 --- a/tests/test_sevenzip.py +++ b/tests/test_sevenzip.py @@ -1,21 +1,10 @@ # -# Copyright (c) nexB Inc. and others. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import os diff --git a/tests/test_vmimage.py b/tests/test_vmimage.py index 76fa9aa..6653cdd 100644 --- a/tests/test_vmimage.py +++ b/tests/test_vmimage.py @@ -1,22 +1,10 @@ -# -*- coding: utf-8 -*- # -# Copyright (c) nexB Inc. and others. -# SPDX-License-Identifier: Apache-2.0 -# -# Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +# Copyright (c) nexB Inc. and others. All rights reserved. # ScanCode is a trademark of nexB Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +# See http://www.apache.org/licenses/LICENSE-2.0 for the license text. +# See https://github.com/nexB/extractcode for support or download. +# See https://aboutcode.org for more information about nexB OSS projects. # import os From 6c00362204538e87de49ea4da2249f943c14acf6 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 00:02:45 +0200 Subject: [PATCH 28/42] Use new coomadn invocation for 7zip Signed-off-by: Philippe Ombredanne --- src/extractcode/sevenzip.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/extractcode/sevenzip.py b/src/extractcode/sevenzip.py index c044e49..0d7dbca 100644 --- a/src/extractcode/sevenzip.py +++ b/src/extractcode/sevenzip.py @@ -242,7 +242,7 @@ def extract_all_files_at_once(location, target_dir, arch_type='*', skip_symlinks ex_args = build_7z_extract_command( location=location, target_dir=target_dir, arch_type=arch_type) - rc, stdout, stderr = command.execute2(**ex_args) + rc, stdout, stderr = command.execute(**ex_args) if rc != 0: if TRACE: @@ -498,10 +498,9 @@ def list_entries(location, arch_type='*'): cmd_loc = get_command_location() - rc, stdout, stderr = command.execute2( + rc, stdout, stderr = command.execute( cmd_loc=cmd_loc, args=args, - lib_dir=lib_dir, env=timezone, to_files=True) From f358194462c372f2e68227e83ceabf30922388c0 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 00:03:29 +0200 Subject: [PATCH 29/42] Improve kernel settings doc for VMs Signed-off-by: Philippe Ombredanne --- README.rst | 7 +++---- src/extractcode/vmimage.py | 7 +++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/README.rst b/README.rst index 3dd5972..5fdb1e9 100644 --- a/README.rst +++ b/README.rst @@ -64,6 +64,7 @@ file cannot be read as required by libguestfish. 
Run this command as a temporary and immediate fix:: + sudo chmod 0644 /boot/vmlinuz-* for k in /boot/vmlinuz-* do sudo dpkg-statoverride --add --update root root 0644 /boot/vmlinuz-$k done @@ -87,9 +88,9 @@ https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725/comments/3 :: 2. Set executable permissions:: - sudo chmod +x /etc/kernel/postinst.d/statoverride + sudo chmod +x /etc/kernel/postinst.d/statoverride -See also for a complete discussion: +See also these links for a complete discussion: - https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725 - https://bugzilla.redhat.com/show_bug.cgi?id=1670790 @@ -109,7 +110,6 @@ ExtractCode will use these environment variables if set: shared library used to support some of the archive formats. If not provided, ExtractCode will look for a plugin-provided libarchive library path. See https://github.com/nexB/scancode-plugins/tree/main/builtins for such plugins. - If no plugin contributes libarchive, then a final attempt is made to look for it in the PATH using standard DLL loading techniques. @@ -117,7 +117,6 @@ ExtractCode will use these environment variables if set: some of the archive formats. If not provided, ExtractCode will look for a plugin-provided 7z executable path. See https://github.com/nexB/scancode-plugins/tree/main/builtins for such plugins. - If no plugin contributes 7z, then a final attempt is made to look for it in the PATH. \ No newline at end of file diff --git a/src/extractcode/vmimage.py b/src/extractcode/vmimage.py index 3c39828..fb1740a 100644 --- a/src/extractcode/vmimage.py +++ b/src/extractcode/vmimage.py @@ -42,14 +42,17 @@ 'WARNING: guestfish executable is not installed. ' 'Unable to extract virtual machine image: you need to install the ' 'guestfish tool from libguestfs and extra FS drivers if needed. ' - 'See the ExtractCode README.rst and https://libguestfs.org/ for details.' 
+ 'See the ExtractCode README.rst at ' + 'https://github.com/nexB/extractcode/blob/main/README.rst ' + 'and https://libguestfs.org/ for details.' ) GUESTFISH_KERNEL_NOT_READABLE = ( '''libguestfs requires the kernel executable to be readable. This is the case by default on most Linux distributions except on Ubuntu. Please follow the instructions in ExtractCode installation guide to make this happen. -See the ExtractCode README.rst for details. +See deatils in the ExtractCode README.rst at: +https://github.com/nexB/extractcode/blob/main/README.rst ' ''') EXTRACTCODE_GUESTFISH_PATH_ENVVAR = 'EXTRACTCODE_GUESTFISH_PATH' From 3aeb2ec68d313b75430539d9e4d2e57c53ef6998 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 11:24:39 +0200 Subject: [PATCH 30/42] Update format Signed-off-by: Philippe Ombredanne --- setup.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index f791084..f192f22 100644 --- a/setup.cfg +++ b/setup.cfg @@ -20,7 +20,7 @@ classifiers = Programming Language :: Python :: 3 :: Only Topic :: Software Development Topic :: Utilities -keywords = +keywords = utilities [options] @@ -43,4 +43,4 @@ testing = docs= Sphinx>=3.3.1 sphinx-rtd-theme>=0.5.0 - doc8>=0.8.1 \ No newline at end of file + doc8>=0.8.1 From 2c412e8222d4d615384a24e2ddc472b0c9703916 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 11:24:57 +0200 Subject: [PATCH 31/42] Add Python 3.9 to Travis Signed-off-by: Philippe Ombredanne --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index 1b52eb2..1a90a38 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,6 +13,7 @@ python: - "3.6" - "3.7" - "3.8" + - "3.9" # Scripts to run at install stage install: ./configure --dev From 69eec23792d59dbdc3a3acb1711884560cf27073 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 11:27:35 +0200 Subject: [PATCH 32/42] Format and remove spurious spaces From 
https://github.com/nexB/typecode/pull/20 Reported-by: Pierre Tardy Signed-off-by: Philippe Ombredanne --- configure.bat | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/configure.bat b/configure.bat index 8c497ba..80d0a43 100644 --- a/configure.bat +++ b/configure.bat @@ -9,7 +9,7 @@ @rem ################################ -@rem # A configuration script to set things up: +@rem # A configuration script to set things up: @rem # create a virtualenv and install or update thirdparty packages. @rem # Source this script for initial configuration @rem # Use configure --help for details @@ -48,7 +48,7 @@ set "CFG_BIN_DIR=%CFG_ROOT_DIR%\%VIRTUALENV_DIR%\Scripts" @rem ################################ -@rem # Set the quiet flag to empty if not defined +@rem # Set the quiet flag to empty if not defined if not defined CFG_QUIET ( set "CFG_QUIET= " ) @@ -65,8 +65,8 @@ if "%1" EQU "--dev" ( set "CFG_REQUIREMENTS=%DEV_REQUIREMENTS%" set CFG_DEV_MODE=1 ) -if "%1" EQU "--python" ( - echo "The --python is now DEPRECATED. Use the PYTHON_EXECUTABLE environment +if "%1" EQU "--python"( + echo "The --python option is now DEPRECATED. Use the PYTHON_EXECUTABLE environment" echo "variable instead. Run configure --help for details." exit /b 0 ) @@ -76,7 +76,7 @@ if "%1" EQU "--python" ( @rem # Use environment variables or a file if available. @rem # Otherwise the latest Python by default. 
if not defined PYTHON_EXECUTABLE ( - @rem # check for a file named PYTHON_EXECUTABLE + @rem # check for a file named PYTHON_EXECUTABLE if exist ""%CFG_ROOT_DIR%\PYTHON_EXECUTABLE"" ( set /p PYTHON_EXECUTABLE=<""%CFG_ROOT_DIR%\PYTHON_EXECUTABLE"" ) else ( From 1074c508cc2b579f83f97d0c4a063e362216608f Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 11:54:27 +0200 Subject: [PATCH 33/42] Install and configure libguesfs in CI Signed-off-by: Philippe Ombredanne --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 1d63431..ed2430f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -13,7 +13,7 @@ jobs: image_name: ubuntu-16.04 python_versions: ['3.6', '3.7', '3.8', '3.9'] test_suites: - all: tmp/bin/pytest -vvs + all: sudo chmod 0644 /boot/vmlinuz-* && sudo apt install libguestfs-tools && tmp/bin/pytest -vvs - template: etc/ci/azure-posix.yml parameters: @@ -21,7 +21,7 @@ jobs: image_name: ubuntu-18.04 python_versions: ['3.6', '3.7', '3.8', '3.9'] test_suites: - all: apt-get install libguestfs-tools && tmp/bin/pytest -n 2 -vvs + all: sudo chmod 0644 /boot/vmlinuz-* && sudo apt install libguestfs-tools && tmp/bin/pytest -n 2 -vvs - template: etc/ci/azure-posix.yml parameters: @@ -29,7 +29,7 @@ jobs: image_name: ubuntu-20.04 python_versions: ['3.6', '3.7', '3.8', '3.9'] test_suites: - all: apt-get install libguestfs-tools && tmp/bin/pytest -n 2 -vvs + all: sudo chmod 0644 /boot/vmlinuz-* && sudo apt install libguestfs-tools && tmp/bin/pytest -n 2 -vvs - template: etc/ci/azure-posix.yml parameters: From 08aa847cd440813ccaf5e34ffb753172a9fac2ea Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 12:07:39 +0200 Subject: [PATCH 34/42] Update doc for release Signed-off-by: Philippe Ombredanne --- AUTHORS.rst | 4 ++- CHANGELOG.rst | 41 +++++++++++++++++++++++++-- README.rst | 77 +++++++++++++++++++++++++++++++++++++++++++++++---- 3 
files changed, 114 insertions(+), 8 deletions(-) diff --git a/AUTHORS.rst b/AUTHORS.rst index 00bd7dc..eeaf661 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -2,10 +2,12 @@ The following organizations or individuals have contributed to this repo: - Abhishek Kumar @Abhishek-Dev09 - AlexB @a-tinsmith +- Konrad Weihmann @priv-kweihmann - Maximilian Huber @maxhbr - Michael Rupprecht @michaelrup - Philippe Ombredanne @pombredanne +- Pierre Tardy @tardyp - Qingmin Duanmu @qduanmu - Rakesh Balusa @balusarakesh - Ravi Jain @JRavi2 -- Steven Esser @majurg +- Steven Esser @majurg \ No newline at end of file diff --git a/CHANGELOG.rst b/CHANGELOG.rst index a720e4a..6269e88 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -4,10 +4,47 @@ Changelog v (next) -------- - - Add support for VMDK, QCOW and VDI VM image filesystems extraction + +v21.5.31 +-------- + +- Add support for VMDK, QCOW and VDI VM image filesystems extraction +- Add new configuration mechanism to get third-party binary paths: + + - Use an environment variable + - Or use a plugin-provided path + - Or use well-known system installation locations + - Or use the system PATH + - Or fail with an informative error message + +- Update to use latest skeleton + + +v2021-2-24 +---------- + +- Fix incorrect documentation link + + +v2021-1-21 +---------- + +- Fix bug related to CommonCode libraries loading +- Improve the extra requirements +- Set minimum version for dependencies +- Improve documentation +- Reorganize tests files + + +v2021-1-15 +---------- + +- Drop support for Python 2 +- Use the latest CommonCode and TypeCode libraries +- Add azure-pipelines CI support v20.10 ------ - - Initial release as a split from ScanCode toolkit +- Initial release as a split from ScanCode toolkit diff --git a/README.rst b/README.rst index 5fdb1e9..e71246b 100644 --- a/README.rst +++ b/README.rst @@ -4,16 +4,16 @@ ExtractCode - license: Apache-2.0 - copyright: copyright (c) nexB. Inc. 
and others - homepage_url: https://github.com/nexB/extractcode -- keywords: archive, extraction, libarchive, 7zip, scancode-toolkit +- keywords: archive, extraction, libarchive, 7zip, scancode-toolkit, extractcode ExtractCode is a universal archive extractor. It uses behind the scenes multiple tools such as: -- the Python standard library, +- the Python standard library, - a custom ctypes binding to libarchive, -- the 7zip command line -- optionally libguestfs on Linux +- the 7zip command line, and +- optionally libguestfs on Linux. With these it is possible to extract a large number of common and @@ -28,17 +28,84 @@ binding to libmagic) to select the most appropriate extractor or decompressor function. It can handle multi-level archives such as tar.gz and can extract recursively nested archives. - Visit https://aboutcode.org and https://github.com/nexB/ for support and download. +We run CI tests on: + + - Azure pipelines https://dev.azure.com/nexB/extractcode/_build + +We run CI tests on: + + - Azure pipelines https://dev.azure.com/nexB/extractcode/_build + +To install this package with its full capability (where the binaries for +7zip and libarchive are installed), use the `full` option:: + + pip install extractcode[full] + +If you want to use the version of binaries (possibly) provided by your operating +system, use the `minimal` option:: + + pip install extractcode + +In this case, you will need to provide a working libarchive and 7zip +available in one of these ways: + +- **a typecode-libarchive and typecode-7z plugin**: See the standard ones at + https://github.com/nexB/scancode-plugins/tree/main/builtins + These can either bundle a libarchive library, a 7z executable or expose a + system-installed libraries. 
+ It does so by providing plugin entry points as ``scancode_location_provider`` + for ``extractcode_libarchive`` that should point to a ``LocationProviderPlugin`` + subclass with a ``get_locations()`` method that must return a mapping with this key: + + - 'extractcode.libarchive.dll': the absolute path to a libarchive DLL + + See for example: + + - https://github.com/nexB/scancode-plugins/blob/4da5fe8a5ab1c87b9b4af9e54d7ad60e289747f5/builtins/extractcode_libarchive-linux/setup.py#L40 + - https://github.com/nexB/scancode-plugins/blob/4da5fe8a5ab1c87b9b4af9e54d7ad60e289747f5/builtins/extractcode_libarchive-linux/src/extractcode_libarchive/__init__.py#L17 + + And the ``scancode_location_provider`` for ``extractcode_7zip`` should point + to a ``LocationProviderPlugin`` subclass with a ``get_locations()`` method that must + return a mapping with this key: + + - 'extractcode.sevenzip.exe': the absolute path to a 7zip executable + + See for example: + + - https://github.com/nexB/scancode-plugins/blob/4da5fe8a5ab1c87b9b4af9e54d7ad60e289747f5/builtins/extractcode_7z-linux/setup.py#L40 + - https://github.com/nexB/scancode-plugins/blob/4da5fe8a5ab1c87b9b4af9e54d7ad60e289747f5/builtins/extractcode_7z-linux/src/extractcode_7z/__init__.py#L18 + +- **environment variables**: + + - EXTRACTCODE_LIBARCHIVE_PATH: the absolute path to a libarchive DLL + - EXTRACTCODE_7Z_PATH: the absolute path to a 7zip executable + + +- **a system-installed libarchive and 7zip executable in the system PATH**: + + +The supported versions are: + +- libarchive 3.5.x +- 7zip 16.5.x + + +Development +----------- + + To set up the development environment:: source configure + To run unit tests:: pytest -vvs -n 2 + To clean up development environment:: ./configure --clean From 28528df99ca5cac7ad25b2da03c005512c037369 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 13:21:21 +0200 Subject: [PATCH 35/42] Skip vm image tests on Travis Also support Python 3.9 Signed-off-by: Philippe 
Ombredanne --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 1b52eb2..8f9412e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,9 +13,10 @@ python: - "3.6" - "3.7" - "3.8" + - "3.9" # Scripts to run at install stage install: ./configure --dev # Scripts to run at script stage -script: tmp/bin/pytest +script: tmp/bin/pytest --ignore=tests/test_vmimage.py From 0e09ad9eb77ca0b580d71baa428955a0a56d19f1 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 19:17:43 +0200 Subject: [PATCH 36/42] Bump to more modern version of setuptools_scm And remove v prefix from fallback version Signed-off-by: Philippe Ombredanne --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8eebe91..852f0fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,11 +1,11 @@ [build-system] -requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 4"] +requires = ["setuptools >= 50", "wheel", "setuptools_scm[toml] >= 6"] build-backend = "setuptools.build_meta" [tool.setuptools_scm] # this is used populated when creating a git archive # and when there is .git dir and/or there is no git installed -fallback_version = "v9999.$Format:%h-%cs$" +fallback_version = "9999.$Format:%h-%cs$" [tool.pytest.ini_options] norecursedirs = [ From e339a70e1a46b613fa73b9d0a9273fe7640acb8d Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 19:18:09 +0200 Subject: [PATCH 37/42] Add space for correct syntax Signed-off-by: Philippe Ombredanne --- configure.bat | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configure.bat b/configure.bat index 80d0a43..c12f937 100644 --- a/configure.bat +++ b/configure.bat @@ -65,7 +65,7 @@ if "%1" EQU "--dev" ( set "CFG_REQUIREMENTS=%DEV_REQUIREMENTS%" set CFG_DEV_MODE=1 ) -if "%1" EQU "--python"( +if "%1" EQU "--python" ( echo "The --python option is now DEPRECATED. 
Use the PYTHON_EXECUTABLE environment" echo "variable instead. Run configure --help for details." exit /b 0 From a0a1436a949473f4b8bd2b6b0657947591fde31b Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 19:53:08 +0200 Subject: [PATCH 38/42] Sort imports Signed-off-by: Philippe Ombredanne --- src/extractcode/vmimage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/extractcode/vmimage.py b/src/extractcode/vmimage.py index fb1740a..24c2bbf 100644 --- a/src/extractcode/vmimage.py +++ b/src/extractcode/vmimage.py @@ -16,8 +16,8 @@ import attr from commoncode import fileutils -from commoncode.text import as_unicode from commoncode.system import on_linux +from commoncode.text import as_unicode from extractcode import ExtractErrorFailedToExtract From 9d9d1a16934a4495109d5c9755bd230c80650512 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Mon, 31 May 2021 21:53:56 +0200 Subject: [PATCH 39/42] Improve failure reporting Signed-off-by: Philippe Ombredanne --- tests/test_extractcode_cli.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_extractcode_cli.py b/tests/test_extractcode_cli.py index c7cd6e4..91bf597 100644 --- a/tests/test_extractcode_cli.py +++ b/tests/test_extractcode_cli.py @@ -70,13 +70,17 @@ def test_extractcode_command_does_extract_verbose(): result = run_extract(['--verbose', test_dir], expected_rc=1) assert os.path.exists(os.path.join(test_dir, 'some.tar.gz-extract')) - assert 'Extracting archives...' in result.stderr - assert 'some.tar.gz' in result.stdout - assert 'broken.tar.gz' in result.stderr - assert 'tarred_gzipped.tgz' in result.stdout - assert 'ERROR extracting' in result.stderr - assert "broken.tar.gz: Unrecognized archive format" in result.stderr - assert 'Extracting done.' in result.stderr + try: + assert 'some.tar.gz' in result.stdout + assert 'tarred_gzipped.tgz' in result.stdout + + assert 'Extracting archives...' 
in result.stderr + assert 'ERROR extracting' in result.stderr + assert 'broken.tar.gz' in result.stderr + assert "broken.tar.gz: Unrecognized archive format" in result.stderr + assert 'Extracting done.' in result.stderr + except: + assert [result.stderr, result.stdout] == [] def test_extractcode_command_always_shows_something_if_not_using_a_tty_verbose_or_not(): From 6e765bf13f481b585f7834870df3014bcca721f1 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 1 Jun 2021 09:38:56 +0200 Subject: [PATCH 40/42] Add test documentation Signed-off-by: Philippe Ombredanne --- tests/test_extractcode_cli.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_extractcode_cli.py b/tests/test_extractcode_cli.py index 91bf597..6ef6b74 100644 --- a/tests/test_extractcode_cli.py +++ b/tests/test_extractcode_cli.py @@ -32,6 +32,8 @@ def run_extract(options, expected_rc=None, cwd=None): Run extractcode as a plain subprocess. Return rc, stdout, stderr. """ bin_dir = 'Scripts' if on_windows else 'bin' + # note: this assumes that we are using a standard directory layout as set + # with the configure script cmd_loc = os.path.join(project_root, 'tmp', bin_dir, 'extractcode') assert os.path.exists(cmd_loc + ('.exe' if on_windows else '')) args = [cmd_loc] + options From bbbffbcfdbf9edf29f62cd4cd6a443d169ce5394 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 1 Jun 2021 10:46:06 +0200 Subject: [PATCH 41/42] Update documentation Signed-off-by: Philippe Ombredanne --- CHANGELOG.rst | 2 +- README.rst | 183 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 126 insertions(+), 59 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6269e88..f666fe2 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -5,7 +5,7 @@ v (next) -------- -v21.5.31 +v21.6.1 -------- - Add support for VMDK, QCOW and VDI VM image filesystems extraction diff --git a/README.rst b/README.rst index e71246b..23ae9cf 100644 --- a/README.rst +++ b/README.rst @@ -6,40 +6,89 
@@ ExtractCode - homepage_url: https://github.com/nexB/extractcode - keywords: archive, extraction, libarchive, 7zip, scancode-toolkit, extractcode +Supports Windows, Linux and macOS on 64 bits processors and Python 3.6 to 3.9. -ExtractCode is a universal archive extractor. It uses behind the scenes -multiple tools such as: + +**ExtractCode is a (mostly) universal archive extractor.** + +Install with:: + + pip install extractcode[full] + + +Why another extractor? +---------------------- + +**it will extract!** + +ExtractCode will extract things where other extractors may fail. + +- Say you want to extract the tarball of the Linux kernel source code on Windows. + It contains paths that are the same when ignoring the case and therefore will + not extract OK on Windows: some file may be munged or the extract may fail. + +- Or a tarball (on any OS) may contain multiple times the exact same path. In + these cases the paths showing up earlier in the archive may be "hidden" and + overwritten by the same path showing up later in the archive giving the + impression that there is only one file. + +- Or an archive may be damaged a little but most files can still be extracted. + +- Or the extracted files have such permissions that you cannot read them and are + not owned by you. + +- Or the archive may contain weird paths including relative paths that may be + problematic to extract. + +- Or the archive may contain special file types (character/device files) that + may be problematic to extract. + +- Or an archive may be a virtual disk or some file system(s) images that would + typically need to be mounted to be accessed, and may require root access + and guesswork to find out which partition and filesystem are at play and + which driver to use. + +In all these cases, ExtractCode will extract and try hard to do the right thing to +obtain the actual archived content when other tools may fail.
+ +It can also extract recursively any type of (nested) archives-in-archives. + +As a downside, the extracted content may not be exactly what would be expected +to use the contained files: for instance ... but this is perfectly OK for +file content analysis for software composition or forensic analysis. + +Behind the scenes, ExtractCode uses multiple tools such as: - the Python standard library, - a custom ctypes binding to libarchive, -- the 7zip command line, and +- the 7zip command line tool, and - optionally libguestfs on Linux. -With these it is possible to extract a large number of common and - -less common archives and compressed files. ExtractCode tries to extract things -in the same way on all OSes, including auto-renaming files that would not have -valid names on certain filesystems or when there are multiple copies of the same -path in a given archive (which is possible in a tar). +With these, it is possible to extract a large number of common and less common +archives and compressed file types. ExtractCode tries to extract things in the +same way on all supported OSes, including auto-renaming files that would have +invalid, non-extractible names on certain filesystems or when there are multiple +copies of the same path in a given archive (which is possible in a tar). -The extraction is driven from a "voting" system that considers the -file extension(s) and name, the filetype and mimetype (using a ctypes -binding to libmagic) to select the most appropriate extractor or -decompressor function. It can handle multi-level archives such as tar.gz and -can extract recursively nested archives. +The extraction is driven from a "voting" system that considers the file +extension(s) and name, the filetype and mimetype (using a ctypes binding to +libmagic) to select the most appropriate extractor or decompressor function. +It can handle multi-level archives such as tar.gz and can extract recursively +any nested archives.
Visit https://aboutcode.org and https://github.com/nexB/ for support and download. + We run CI tests on: - Azure pipelines https://dev.azure.com/nexB/extractcode/_build -We run CI tests on: - - Azure pipelines https://dev.azure.com/nexB/extractcode/_build +Installation +------------ To install this package with its full capability (where the binaries for -7zip and libarchive are installed), use the `full` option:: +7zip and libarchive are installed), use the `full` extra option:: pip install extractcode[full] @@ -48,45 +97,47 @@ system, use the `minimal` option:: pip install extractcode -In this case, you will need to provide a working libarchive and 7zip -available in one of these ways: +In this case, you will need to provide a working and compatible libarchive and +7zip installed and configured in one of these ways such that ExtractCode can +find them: -- **a typecode-libarchive and typecode-7z plugin**: See the standard ones at +- **a typecode-libarchive and typecode-7z plugin**: See the standard ones at https://github.com/nexB/scancode-plugins/tree/main/builtins These can either bundle a libarchive library, a 7z executable or expose a system-installed libraries. 
It does so by providing plugin entry points as ``scancode_location_provider`` for ``extractcode_libarchive`` that should point to a ``LocationProviderPlugin`` - subclass with a ``get_locations()`` method that must return a mapping with this key: + subclass with a ``get_locations()`` method that must return a mapping with + this key: - - 'extractcode.libarchive.dll': the absolute path to a libarchive DLL + - 'extractcode.libarchive.dll': the absolute path to a **libarchive** shared object/DLL See for example: - https://github.com/nexB/scancode-plugins/blob/4da5fe8a5ab1c87b9b4af9e54d7ad60e289747f5/builtins/extractcode_libarchive-linux/setup.py#L40 - https://github.com/nexB/scancode-plugins/blob/4da5fe8a5ab1c87b9b4af9e54d7ad60e289747f5/builtins/extractcode_libarchive-linux/src/extractcode_libarchive/__init__.py#L17 - And the ``scancode_location_provider`` for ``extractcode_7zip`` should point - to a ``LocationProviderPlugin`` subclass with a ``get_locations()`` method that must - return a mapping with this key: + And in the same way, the ``scancode_location_provider`` for ``extractcode_7zip`` + should point to a ``LocationProviderPlugin`` subclass with a ``get_locations()`` + method that must return a mapping with this key: - - 'extractcode.sevenzip.exe': the absolute path to a 7zip executable + - 'extractcode.sevenzip.exe': the absolute path to a **7zip** executable See for example: - https://github.com/nexB/scancode-plugins/blob/4da5fe8a5ab1c87b9b4af9e54d7ad60e289747f5/builtins/extractcode_7z-linux/setup.py#L40 - https://github.com/nexB/scancode-plugins/blob/4da5fe8a5ab1c87b9b4af9e54d7ad60e289747f5/builtins/extractcode_7z-linux/src/extractcode_7z/__init__.py#L18 -- **environment variables**: +- use **environment variables** to point to installed binaries: - EXTRACTCODE_LIBARCHIVE_PATH: the absolute path to a libarchive DLL - EXTRACTCODE_7Z_PATH: the absolute path to a 7zip executable -- **a system-installed libarchive and 7zip executable in the system PATH**: +- **a 
system-installed libarchive and 7zip executable** available in the system **PATH**. -The supported versions are: +The supported binary tools versions are: - libarchive 3.5.x - 7zip 16.5.x @@ -95,10 +146,9 @@ The supported versions are: Development ----------- - To set up the development environment:: - source configure + source configure --dev To run unit tests:: @@ -116,18 +166,43 @@ To run the command line tool in the activated environment:: ./extractcode -h +Configuration with environment variables +---------------------------------------- + +ExtractCode will use these environment variables if set: + +- EXTRACTCODE_LIBARCHIVE_PATH : the path to the ``libarchive.so`` libarchive + shared library used to support some of the archive formats. If not provided, + ExtractCode will look for a plugin-provided libarchive library path. See + https://github.com/nexB/scancode-plugins/tree/main/builtins for such plugins. + If no plugin contributes libarchive, then a final attempt is made to look for + it in the PATH using standard DLL loading techniques. + +- EXTRACTCODE_7Z_PATH : the path to the ``7z`` 7zip executable used to support + some of the archive formats. If not provided, ExtractCode will look for a + plugin-provided 7z executable path. See + https://github.com/nexB/scancode-plugins/tree/main/builtins for such plugins. + If no plugin contributes 7z, then a final attempt is made to look for + it in the PATH. + +- EXTRACTCODE_GUESTFISH_PATH : the path to the ``guestfish`` tool from + libguestfs to use to extract VM images. If not provided, ExtractCode will look + in the PATH for an installed ``guestfish`` executable instead. + + + Adding support for VM images extraction --------------------------------------- -Adding support for VM images requires the manual installation of libguestfs -tools system package. This is suport on Linux only. 
On Debian and Ubuntu you can -use this:: +Adding support for VM images requires the manual installation of the +libguestfs-tools system package. This is supported only on Linux. +On Debian and Ubuntu you can use this command:: sudo apt-get install libguestfs-tools On Ubuntu only, an additional manual step is required as the kernel executable -file cannot be read as required by libguestfish. +file cannot be read by users as required by libguestfish. Run this command as a temporary and immediate fix:: @@ -136,10 +211,9 @@ Run this command as a temporary and immediate fix:: do sudo dpkg-statoverride --add --update root root 0644 /boot/vmlinuz-$k done - -But you likely want both this temporary fix and a permanent fix; otherwise each -kernel update will revert to the default permissions and extractcode will stop -working for VM images extraction. +You likely want both this temporary fix and a more permanent fix; otherwise each +kernel update will revert to the default permissions and ExtractCode will stop +working for VM images extraction. Therefore follow these instructions: @@ -164,26 +238,19 @@ See also these links for a complete discussion: - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662/comments/24 -Configuration with environment variables ----------------------------------------- +Alternative +----------- -ExtractCode will use these environment variables if set: +These other tools are related and were considered before creating ExtractCode: -- EXTRACTCODE_GUESTFISH_PATH : the path to the ``guestfish`` tool from - libguestfs to use to extract VM images. If not provided, ExtractCode will look - in the PATH for an installed ``guestfish`` executable instead. +These tools provide built-in, original extraction capabilities: -- EXTRACTCODE_LIBARCHIVE_PATH : the path to the ``libarchive.so`` libarchive - shared library used to support some of the archive formats. If not provided, - ExtractCode will look for a plugin-provided libarchive library path.
See - https://github.com/nexB/scancode-plugins/tree/main/builtins for such plugins. - If no plugin contributes libarchive, then a final attempt is made to look for - it in the PATH using standard DLL loading techniques. +- https://libarchive.org/ (integrated in ExtractCode) (BSD license) +- https://www.7-zip.org/ (integrated in ExtractCode) (LGPL license) +- https://theunarchiver.com/command-line (maintenance status unknown) (LGPL license) -- EXTRACTCODE_7Z_PATH : the path to the ``7z`` 7zip executable used to support - some of the archive formats. If not provided, ExtractCode will look for a - plugin-provided 7z executable path. See - https://github.com/nexB/scancode-plugins/tree/main/builtins for such plugins. - If no plugin contributes 7z, then a final attempt is made to look for - it in the PATH. - \ No newline at end of file +These tools are command line tools wrapping other extraction tools and are +similar to ExtractCode but with different goals: + +- https://github.com/wummel/patool (wrapper on many CLI tools) (GPL license) +- https://github.com/dtrx-py/dtrx (wrapper on a few CLI tools) (recently revived) (GPL license) From 4a8ef69db5a901cbb8d588a24d61e1e23c815f05 Mon Sep 17 00:00:00 2001 From: Philippe Ombredanne Date: Tue, 1 Jun 2021 11:52:20 +0200 Subject: [PATCH 42/42] Improve documentation and readability Apply formatting and minor refactoring. Refine and clarify documentation.
Signed-off-by: Philippe Ombredanne --- extractcode.ABOUT | 8 +- setup.cfg | 23 +++- src/extractcode/__init__.py | 8 +- src/extractcode/archive.py | 75 ++++++++++--- src/extractcode/cli.py | 102 ++++++++++++++--- src/extractcode/extract.py | 56 +++++++--- src/extractcode/libarchive2.py | 113 +++++++++++-------- src/extractcode/sevenzip.py | 179 ++++++++++++++++++------------ src/extractcode/uncompress.py | 11 +- src/extractcode/vmimage.py | 85 ++++++++------ tests/extractcode_assert_utils.py | 43 ++++--- tests/test_archive.py | 16 ++- tests/test_extract.py | 1 - tests/test_libarchive2.py | 7 +- tests/test_sevenzip.py | 10 +- 15 files changed, 500 insertions(+), 237 deletions(-) diff --git a/extractcode.ABOUT b/extractcode.ABOUT index 7ebfb73..75e231d 100644 --- a/extractcode.ABOUT +++ b/extractcode.ABOUT @@ -1,9 +1,9 @@ about_resource: . copyright: copyright (c) nexB. Inc. and others -description: A mostly universal archive extractor using z7zip, libarchve, other - libraries and the Python standard library for reliable archive extraction. - It is used by ScanCode toolkit and related projects -keywords: archive, extraction, libarchive, 7zip, scancode-toolkit +description: A mostly universal archive extractor using 7zip, libarchive and the + Python standard library for reliable archive extraction on Linux, Windows and + macOS. It is used by ScanCode toolkit and related projects. +keywords: archive, extraction, libarchive, 7zip, gzip, xz, lzma, bzip2, tar, ar, cpio, scancode-toolkit homepage_url: https://github.com/nexB/extractcode holder: nexB. Inc. 
and others holder_contact: info@aboutcode.org diff --git a/setup.cfg b/setup.cfg index 9aa271a..4276949 100644 --- a/setup.cfg +++ b/setup.cfg @@ -10,7 +10,7 @@ author_email = info@aboutcode.org license = Apache-2.0 # description must be on ONE line https://github.com/pypa/setuptools/issues/1390 -description = A mostly universal archive extractor using z7zip, libarchve, other libraries and the Python standard library for reliable archive extraction. +description = A mostly universal archive extractor using 7zip, libarchive and the Python standard library for reliable archive extraction. long_description = file:README.rst url = https://github.com/nexB/extractcode classifiers = @@ -26,6 +26,27 @@ keywords = extraction libarchive 7zip + 7z + gzip + bzip2 + xz + lzma + lz4 + lzip + zstd + Z + tar + xar + ar + cpio + vmdk + qcow2 + vhd + iso + deb + cab + rpm + patch scancode-toolkit [options] diff --git a/src/extractcode/__init__.py b/src/extractcode/__init__.py index 93d910d..fb6095d 100644 --- a/src/extractcode/__init__.py +++ b/src/extractcode/__init__.py @@ -137,11 +137,11 @@ def remove_backslashes_and_dotdots(directory): def new_name(location, is_dir=False): """ Return a new non-existing location from a `location` usable to write a file - or create directory without overwriting existing files or directories in the same - parent directory, ignoring the case of the filename. + or create directory without overwriting existing files or directories in the + same parent directory, ignoring the case of the filename. - The case of the filename is ignored to ensure that similar results are returned - across case sensitive (*nix) and case insensitive file systems. + The case of the filename is ignored to ensure that similar results are + returned across case sensitive (*nix) and case insensitive file systems. To find a new unique filename, this tries new names this way: * pad a directory name with _X where X is an incremented number. 
diff --git a/src/extractcode/archive.py b/src/extractcode/archive.py index 8b6f284..ae5e268 100644 --- a/src/extractcode/archive.py +++ b/src/extractcode/archive.py @@ -15,7 +15,6 @@ from commoncode import filetype from commoncode import functional from commoncode.ignore import is_ignored - from typecode import contenttype from extractcode import all_kinds @@ -204,7 +203,9 @@ def get_handlers(location): mtype = T.mimetype_file if TRACE_DEEP: - logger.debug('get_handlers: processing %(location)s: ftype: %(ftype)s, mtype: %(mtype)s ' % locals()) + logger.debug( + 'get_handlers: processing %(location)s: ' + 'ftype: %(ftype)s, mtype: %(mtype)s ' % locals()) for handler in archive_handlers: if not handler.extractors: continue @@ -223,9 +224,19 @@ def get_handlers(location): extension_matched = exts and location.lower().endswith(exts) if TRACE_DEEP: - print(f' get_handlers: matched type: {type_matched}, mime: {mime_matched}, ext: {extension_matched}' % locals()) - - if handler.strict and not (type_matched and mime_matched and extension_matched): + print( + f' get_handlers: matched type: {type_matched}, ' + f'mime: {mime_matched}, ext: {extension_matched}' % locals() + ) + + if ( + handler.strict + and not ( + type_matched + and mime_matched + and extension_matched + ) + ): if TRACE_DEEP: print(f' get_handlers: skip strict: {handler.name}') continue @@ -449,17 +460,30 @@ def try_to_extract(location, target_dir, extractor): extract_deb = libarchive2.extract -# sevenzip is best for windows lib formats and works fine otherwise. libarchive works on standard ar formats. -extract_ar = functional.partial(extract_with_fallback, extractor1=libarchive2.extract, extractor2=sevenzip.extract) +# sevenzip is best for windows lib formats and works fine otherwise. libarchive +# works on standard ar formats. 
+extract_ar = functional.partial( + extract_with_fallback, + extractor1=libarchive2.extract, + extractor2=sevenzip.extract, +) extract_msi = sevenzip.extract extract_cpio = libarchive2.extract # sevenzip should be best at extracting 7zip but most often libarchive is better first -extract_7z = functional.partial(extract_with_fallback, extractor1=libarchive2.extract, extractor2=sevenzip.extract) +extract_7z = functional.partial( + extract_with_fallback, + extractor1=libarchive2.extract, + extractor2=sevenzip.extract, +) # libarchive is best for the run of the mill zips, but sevenzip sometimes is better -extract_zip = functional.partial(extract_with_fallback, extractor1=libarchive2.extract, extractor2=sevenzip.extract) +extract_zip = functional.partial( + extract_with_fallback, + extractor1=libarchive2.extract, + extractor2=sevenzip.extract, +) extract_springboot = functional.partial(try_to_extract, extractor=extract_zip) @@ -515,7 +539,12 @@ def try_to_extract(location, target_dir, extractor): OfficeDocHandler = Handler( name='Office doc', - filetypes=('zip archive', 'microsoft word 2007+', 'microsoft excel 2007+', 'microsoft powerpoint 2007+'), + filetypes=( + 'zip archive', + 'microsoft word 2007+', + 'microsoft excel 2007+', + 'microsoft powerpoint 2007+', + ), mimetypes=('application/zip', 'application/vnd.openxmlformats',), # Extensions of office documents that are zip files too extensions=( @@ -553,7 +582,7 @@ def try_to_extract(location, target_dir, extractor): strict=True ) - # see http://tools.android.com/tech-docs/new-build-system/aar-formats +# see http://tools.android.com/tech-docs/new-build-system/aar-formats AndroidLibHandler = Handler( name='Android library', filetypes=('zip archive',), @@ -827,8 +856,16 @@ def try_to_extract(location, target_dir, extractor): name='Tar bzip2', filetypes=('bzip2 compressed',), mimetypes=('application/x-bzip2',), - extensions=('.tar.bz2', '.tar.bz', '.tar.bzip', '.tar.bzip2', - '.tbz', '.tbz2', '.tb2', '.tarbz2',), + 
extensions=( + '.tar.bz2', + '.tar.bz', + '.tar.bzip', + '.tar.bzip2', + '.tbz', + '.tbz2', + '.tb2', + '.tarbz2', + ), kind=regular_nested, extractors=[extract_tar], strict=False @@ -876,10 +913,11 @@ def try_to_extract(location, target_dir, extractor): NugetHandler = Handler( name='Nuget', - # weirdly enough the detection by libmagic is sometimes wrong - # TODO file a bug upstream - # this is due to this: https://en.wikipedia.org/wiki/Open_Packaging_Conventions#File_formats_using_the_OPC + # TODO: file a bug upstream + # Weirdly enough the detection by libmagic is sometimes wrong + # this is due to this issue: # being recognized by libmagic as an OOXML file + # https://en.wikipedia.org/wiki/Open_Packaging_Conventions#File_formats_using_the_OPC filetypes=('zip archive', 'microsoft ooxml',), mimetypes=('application/zip', 'application/octet-stream',), extensions=('.nupkg',), @@ -921,7 +959,10 @@ def try_to_extract(location, target_dir, extractor): DebHandler = Handler( name='Debian package', filetypes=('debian binary package',), - mimetypes=('application/vnd.debian.binary-package', 'application/x-archive',), + mimetypes=( + 'application/vnd.debian.binary-package', + 'application/x-archive', + ), extensions=('.deb', '.udeb',), kind=package, extractors=[extract_deb], diff --git a/src/extractcode/cli.py b/src/extractcode/cli.py index df353ed..b4064b3 100644 --- a/src/extractcode/cli.py +++ b/src/extractcode/cli.py @@ -84,20 +84,73 @@ class ExtractCommand(cliutils.BaseCommand): @click.command(name='extractcode', epilog=epilog_text, cls=ExtractCommand) @click.pass_context -@click.argument('input', metavar='', type=click.Path(exists=True, readable=True)) - -@click.option('--verbose', is_flag=True, default=False, help='Print verbose file-by-file progress messages.') -@click.option('--quiet', is_flag=True, default=False, help='Do not print any summary or progress message.') -@click.option('--shallow', is_flag=True, default=False, help='Do not extract recursively nested 
archives (e.g. not archives in archives).') -@click.option('--replace-originals', is_flag=True, default=False, help='Replace extracted archives by the extracted content.') -@click.option('--ignore', default=[], multiple=True, help='Ignore files/directories following a glob-pattern.') -@click.option('--all-formats', is_flag=True, default=False, help='Extract archives from all known formats.') +@click.argument( + 'input', + metavar='', + type=click.Path(exists=True, readable=True), +) + +@click.option( + '--verbose', + is_flag=True, + help='Print verbose file-by-file progress messages.', +) +@click.option( + '--quiet', + is_flag=True, + help='Do not print any summary or progress message.', +) +@click.option( + '--shallow', + is_flag=True, + help='Do not extract recursively nested archives in archives.', +) +@click.option( + '--replace-originals', + is_flag=True, + help='Replace extracted archives by the extracted content.', +) +@click.option( + '--ignore', + default=[], + multiple=True, + help='Ignore files/directories matching this glob pattern.', +) + +@click.option( + '--all-formats', + is_flag=True, + help='Extract archives from all known formats.', +) @click.help_option('-h', '--help') -@click.option('--about', is_flag=True, is_eager=True, callback=print_about, help='Show information about ExtractCode and licensing and exit.') -@click.option('--version', is_flag=True, is_eager=True, callback=print_version, help='Show the version and exit.') -def extractcode(ctx, input, verbose, quiet, shallow, replace_originals, ignore, all_formats, *args, **kwargs): # NOQA - """extract archives and compressed files found in the file or directory tree. 
+@click.option( + '--about', + is_flag=True, + is_eager=True, + callback=print_about, + help='Show information about ExtractCode and its licensing and exit.', +) +@click.option( + '--version', + is_flag=True, + is_eager=True, + callback=print_version, + help='Show the version and exit.', +) +def extractcode( + ctx, + input, # NOQA + verbose, + quiet, + shallow, + replace_originals, + ignore, + all_formats, + *args, + **kwargs, +): + """extract archives and compressed files in the file or directory tree. Archives found inside an extracted archive are extracted recursively. Use --shallow for a shallow extraction. @@ -105,7 +158,11 @@ def extractcode(ctx, input, verbose, quiet, shallow, replace_originals, ignore, '-extract' created side-by-side with an archive. """ - abs_location = fileutils.as_posixpath(os.path.abspath(os.path.expanduser(input))) + abs_location = fileutils.as_posixpath( + os.path.abspath( + os.path.expanduser(input) + ) + ) def extract_event(item): """ @@ -159,10 +216,16 @@ def display_extract_summary(): ) for e in xev.errors: - echo_stderr('ERROR extracting: %(source)s: %(e)s' % locals(), fg='red') + echo_stderr( + 'ERROR extracting: %(source)s: %(e)s' % locals(), + fg='red' + ) for warn in xev.warnings: - echo_stderr('WARNING extracting: %(source)s: %(warn)s' % locals(), fg='yellow') + echo_stderr( + 'WARNING extracting: %(source)s: %(warn)s' % locals(), + fg='yellow' + ) summary_color = 'green' if has_warnings: @@ -190,6 +253,7 @@ def display_extract_summary(): if not quiet: echo_stderr('Extracting archives...', fg='green') + with cliutils.progressmanager(extractibles, item_show_func=extract_event, verbose=verbose) as extraction_events: @@ -199,7 +263,9 @@ def display_extract_summary(): if repr(xev) not in unique_extract_events_with_errors: extract_result_with_errors.append(xev) unique_extract_events_with_errors.add(repr(xev)) + display_extract_summary() + else: for xev in extractibles: if xev.done and (xev.warnings or xev.errors): @@ -211,9 
+277,9 @@ def display_extract_summary(): def get_relative_path(path, len_base_path, base_is_dir): """ - Return a posix relative path from the posix 'path' relative to a - base path of `len_base_path` length where the base is a directory if - `base_is_dir` True or a file otherwise. + Return a posix relative path from the posix 'path' relative to a base path + of `len_base_path` length where the base is a directory if `base_is_dir` + True or a file otherwise. """ path = os.fsdecode(path) if base_is_dir: diff --git a/src/extractcode/extract.py b/src/extractcode/extract.py index 774847d..41d4ed6 100644 --- a/src/extractcode/extract.py +++ b/src/extractcode/extract.py @@ -31,15 +31,15 @@ logger.setLevel(logging.DEBUG) """ -Extract archives and compressed files recursively to get the file content available for -further processing. This the high level extraction entry point. +Extract archives and compressed files recursively to get the file content +available for further processing. This the high level extraction entry point. This is NOT a general purpose un-archiver. The code tries hard to do the right thing, BUT the extracted files are not meant to be something that can be -faithfully re-archived to get an equivalent archive. The purpose instead is -to extract the content of the archives as faithfully and safely as possible to -make this content available for scanning: some paths may be altered. Some files -may be altered or skipped entirely. +faithfully re-archived to get an equivalent archive. The purpose instead is to +extract the content of the archives as faithfully and safely as possible to make +this content available for scanning: some paths may be altered. Some files may +be altered or skipped entirely. 
In particular: @@ -127,14 +127,17 @@ def extract( if replace_originals: processed_events_append(event) - # move files around + # move files around when done if replace_originals: for xevent in reversed(processed_events): if xevent.done: source = xevent.source target = xevent.target if TRACE: - logger.debug('extract:replace_originals: replace %(source)r by %(target)r' % locals()) + logger.debug( + 'extract:replace_originals: replace ' + '%(source)r by %(target)r' % locals() + ) fileutils.delete(source) fileutils.copytree(target, source) fileutils.delete(target) @@ -164,7 +167,8 @@ def extract_files( abs_location = abspath(expanduser(location)) for top, dirs, files in fileutils.walk(abs_location, ignored): if TRACE: - logger.debug('extract:walk: top: %(top)r dirs: %(dirs)r files: r(files)r' % locals()) + logger.debug( + 'extract:walk: top: %(top)r dirs: %(dirs)r files: r(files)r' % locals()) if not recurse: if TRACE: @@ -173,7 +177,10 @@ def extract_files( if extractcode.is_extraction_path(d): dirs.remove(d) if TRACE: - logger.debug('extract:walk: not recurse: removed dirs:' + repr(drs.symmetric_difference(set(dirs)))) + logger.debug( + 'extract:walk: not recurse: removed dirs:' + +repr(drs.symmetric_difference(set(dirs))) + ) for f in files: loc = join(top, f) @@ -223,8 +230,8 @@ def extract_file( all_formats=False, ): """ - Extract a single archive at `location` in the `target` directory if it is - of a kind supported in the `kinds` kind tuple. + Extract a single archive at `location` in the `target` directory if it is of + a kind supported in the `kinds` kind tuple. 
""" warnings = [] errors = [] @@ -232,10 +239,20 @@ def extract_file( if TRACE: emodule = getattr(extractor, '__module__', '') ename = getattr(extractor, '__name__', '') - logger.debug(f'extract_file: extractor: for: {location} with kinds: {kinds}: {emodule}.{ename}') + logger.debug( + f'extract_file: extractor: for: {location} with kinds: ' + f'{kinds}: {emodule}.{ename}' + ) if extractor: - yield ExtractEvent(location, target, done=False, warnings=[], errors=[]) + yield ExtractEvent( + source=location, + target=target, + done=False, + warnings=[], + errors=[], + ) + try: # extract first to a temp directory: if there is an error, the # extracted files will not be moved to target @@ -251,7 +268,14 @@ def extract_file( errors.append(traceback.format_exc()) if TRACE: tb = traceback.format_exc() - logger.debug('extract_file: ERROR: %(location)r: %(errors)r\n%(e)r\n%(tb)s' % locals()) + logger.debug( + 'extract_file: ERROR: %(location)r: %(errors)r\n%(e)r\n%(tb)s' % locals()) finally: - yield ExtractEvent(location, target, done=True, warnings=warnings, errors=errors) + yield ExtractEvent( + source=location, + target=target, + done=True, + warnings=warnings, + errors=errors, + ) diff --git a/src/extractcode/libarchive2.py b/src/extractcode/libarchive2.py index 32d97ab..6402e4d 100644 --- a/src/extractcode/libarchive2.py +++ b/src/extractcode/libarchive2.py @@ -45,29 +45,29 @@ logger.setLevel(logging.DEBUG) """ -libarchive2 is a minimal and specialized wrapper around a vendored libarchive archive -extraction library. It only deals with archive extraction and does not know how to -create archives. +libarchive2 is a minimal and specialized wrapper around a vendored libarchive +archive extraction library. It only deals with archive extraction and does not +know how to create archives. 
-Its main purpose is to try hard to extract files from archives on multiple OSes and -makes some compromises in doing so: +Its main purpose is to try hard to extract files from archives on multiple OSes +and makes some compromises in doing so: - special files and links may be skipped entirely and not extracted at all. -- relative paths are resolved to ensure that files are always extracted under a root - extraction directory. +- relative paths are resolved to ensure that files are always extracted under a + root extraction directory. - files and directories may be renamed if they are not unique (ignoring case) in their extraction directory. -- files and directories are renamed by "transliterating" their names to plain ASCII - if their name contain non-ASCI characters. +- files and directories are renamed by "transliterating" their names to plain + ASCII if their names contain non-ASCII characters. -- files and directories are renamed if they contain characters or names that are not - portable on common OSes (e.g. COM1, ":", "*", etc) +- files and directories are renamed if they contain characters or names that are + not portable on common OSes (e.g. COM1, ":", "*", etc) -- permissions and modes are ignored entirely when extracting files to esnure that - extracted files are always readable. +- permissions and modes are ignored entirely when extracting files to ensure + that extracted files are always readable. It is inspired from several libarchive bindings such as libarchive_c and python-libarchive for Python and other similar wrappers for Ruby such as @@ -227,23 +227,24 @@ class Archive(object): Represent an iterable archive containing a list of Entry objects.
Archive is designed to be used as a context manager with the "with" syntax: + with Archive('some.tgz') as archive: for entry in archive: - # dome something with entry + # do something with entry """ def __init__(self, location, uncompress=True, extract=True, block_size=10240): """ Build an Archive object from file at `location`. - If `uncompress` is True, the archive will be uncompressed first if compressed. - (e.g. a tar.gz will be ungzipped). + If `uncompress` is True, the archive will be uncompressed first if + compressed. (e.g. a tar.gz will be ungzipped). - If `extract` is True, the archive will be extracted if this is an archive. - (e.g. a cpio will be extracted). + If `extract` is True, the archive will be extracted if this is an + archive. (e.g. a cpio will be extracted). - If both are True, the archive will be uncompressed then extracted as needed. - (e.g. a tar.xz will be unxzed then untarred at once). + If both are True, the archive will be uncompressed then extracted as + needed. (e.g. a tar.xz will be unxzed then untarred at once). """ msg = 'At least one of `uncompress` or `extract` flag is required.' assert uncompress or extract, msg @@ -256,9 +257,9 @@ def __init__(self, location, uncompress=True, extract=True, block_size=10240): def open(self): """ - Open the archive for reading. - You must call close() when done to free up resources and avoid leaks. - Or use instead the Archive class as a context manager with the "with" keyword. + Open the archive for reading. You must call close() when done to free up + resources and avoid leaks. Or use instead the Archive class as a context + manager with the "with" keyword. """ # first close any existing opened struct for this file self.close() @@ -276,9 +277,9 @@ def open(self): def close(self): """ - Release any memory held by the underlying librachive for this archive. You - must call close() when done with an archive to free up resources and avoid - leaks. 
+ Release any memory held by the underlying libarchive for this archive. + You must call close() when done with an archive to free up resources and + avoid leaks. """ if self.archive_struct: free_archive(self.archive_struct) @@ -286,7 +287,7 @@ def close(self): def iter(self): """ - Yield Entry for this archive. + Yield Entry(ies) for this archive. """ assert self.archive_struct, 'Archive must be used as a context manager.' entry_struct = new_entry() @@ -334,7 +335,7 @@ class attributes. Some attributes are not handled on purpose because they by design to ensure extracted files are readable/writable and owned by the extracting user. """ - # TODO: re-check if users and groups may have some value for origin determination? + # TODO: re-check if users/groups may have some value for origin determination? # an archive object archive = attr.ib(repr=False) @@ -416,10 +417,10 @@ def get_path(self, func, func_w): def write(self, target_dir, transform_path=lambda x: x, skip_links=True): """ - Write entry to a file or directory saved relatively to the `target_dir` and - return the path where the file or directory was written or None if nothing - was written to disk. `transform_path` is a callable taking a path and - returning a transformed path such as resolving relative paths, + Write entry to a file or directory saved relatively to the `target_dir` + and return the path where the file or directory was written or None if + nothing was written to disk. `transform_path` is a callable taking a + path and returning a transformed path such as resolving relative paths, transliterating non-portable characters or other path transformations. The default is a no-op lambda. """ @@ -462,7 +463,10 @@ def write(self, target_dir, transform_path=lambda x: x, skip_links=True): # TODO: return some warning when original path has been renamed?
unique_path = extractcode.new_name(target_path, is_dir=False) if TRACE: - logger.debug('path: \ntarget_path: {}\nunique_path: {}'.format(target_path, unique_path)) + logger.debug( + f'path: \ntarget_path: {target_path}\n' + f'unique_path: {unique_path}', + ) with open(unique_path, 'wb') as target: for content in self.get_content(): @@ -489,7 +493,13 @@ def get_content(self): class ArchiveException(ExtractError): - def __init__(self, rc=None, archive_struct=None, archive_func=None, root_ex=None): + def __init__( + self, + rc=None, + archive_struct=None, + archive_func=None, + root_ex=None, + ): self.root_ex = root_ex if root_ex and isinstance(root_ex, ArchiveException): self.rc = root_ex.rc @@ -508,8 +518,9 @@ def __init__(self, rc=None, archive_struct=None, archive_func=None, root_ex=None def __str__(self): if TRACE: - msg = (u'%(msg)r: in function %(func)r with rc=%(rc)r, errno=%(errno)r, ' - 'root_ex=%(root_ex)r') + msg = ( + '%(msg)r: in function %(func)r with rc=%(rc)r, ' + 'errno=%(errno)r, root_ex=%(root_ex)r') return msg % self.__dict__ return self.msg or '' @@ -534,7 +545,10 @@ class ArchiveErrorFailedToWriteEntry(ArchiveException): pass -class ArchiveErrorPasswordProtected(ArchiveException, ExtractErrorPasswordProtected): +class ArchiveErrorPasswordProtected( + ArchiveException, + ExtractErrorPasswordProtected, +): pass @@ -548,7 +562,8 @@ class ArchiveErrorIllegalOperationOnClosedArchive(ArchiveException): def errcheck(rc, archive_func, args, null=False): """ - ctypes error check handler for functions returning int, or null if null is True. + ctypes error check handler for functions returning int, or null if null is + True. """ if null: if rc is None: @@ -595,17 +610,19 @@ def errcheck(rc, archive_func, args, null=False): ##################################### # libarchive C functions declarations ##################################### -# NOTE: these declaration come with verbose doc to help with debugging and tracing -# lower level errors and issues. 
Some comments and the function signatures are -# copied from libarchve. + +# NOTE: these declarations come with verbose doc to help with debugging and +# tracing lower level errors and issues. Some comments and the function +# signatures are copied from libarchive. # -# NOTE: String data in librachive can be set or accessed as wide character strings or -# narrow char strings. The functions that use wide character strings are suffixed -# with _w. These are different representations of the same data: For example, if you -# store a narrow string and read the corresponding wide string, the object will -# transparently convert formats using the current locale. Similarly, if you store a -# wide string and then store a narrow string for the same data, the previously-set -# wide string will be discarded in favor of the new data. +# NOTE: String data in libarchive can be set or accessed as wide character +# strings or narrow char strings. The functions that use wide character strings +# are suffixed with _w. These are different representations of the same data: +# For example, if you store a narrow string and read the corresponding wide +# string, the object will transparently convert formats using the current +# locale. Similarly, if you store a wide string and then store a narrow string +# for the same data, the previously-set wide string will be discarded in favor +# of the new data. """ To read an archive, you must first obtain an initialized struct archive object diff --git a/src/extractcode/sevenzip.py b/src/extractcode/sevenzip.py index 2567b35..be58754 100644 --- a/src/extractcode/sevenzip.py +++ b/src/extractcode/sevenzip.py @@ -148,8 +148,8 @@ def get_7z_errors(stdout, stderr): def get_7z_warnings(stdout): """ - Return a mapping of {path: warning_message} of 7zip warnings extracted from a - `stdout` text. + Return a mapping of {path: warning_message} of 7zip warnings extracted from + a `stdout` text.
""" # FIXME: we should use only one pass over stdout for errors and warnings cannot_open = 'can not open output file' @@ -178,6 +178,7 @@ def convert_warnings_to_list(warnings): def list_extracted_7z_files(stdout): """ List all files extracted by 7zip based on the stdout analysis. + Based on 7zip Client7z.cpp: static const char *kExtractingString = "Extracting "; """ @@ -197,37 +198,47 @@ def is_rar(location): return T.filetype_file.lower().startswith('rar archive') -def extract(location, target_dir, arch_type='*', file_by_file=on_mac, skip_symlinks=True): +def extract( + location, + target_dir, + arch_type='*', + file_by_file=on_mac, + skip_symlinks=True, +): """ - Extract all files from a 7zip-supported archive file at location in the - target_dir directory. `skip_symlinks` by default. - Return a list of warning messages. - Raise exception on errors. + Extract all files from a 7zip-supported archive file at ``location`` in the + ``target_dir`` directory. ``skip_symlinks`` by default. + + Return a list of warning messages. Raise exception on errors. - The extraction will either be done all-files-at-once (default on most OSes) - or one-file-at-a-time after collecting a directory listing (for some - problematic OSes such as recent macOS) + ``arch_type`` is the type of 7zip archive passed to the -t 7zip option. Can + be None. - `arch_type` is the type of 7zip archive passed to the -t 7zip option. Can be - None. 
+ Based on ``file_by_file`` the extraction will either be done all-files-at- + once (default on most OSes) or one-file-at-a-time after collecting a + directory listing (for some problematic OSes such as recent macOS) """ assert location abs_location = os.path.abspath(os.path.expanduser(location)) if not os.path.exists(abs_location): raise ExtractErrorFailedToExtract( - 'The system cannot find the path specified: {}'.format(repr(abs_location))) + f'The system cannot find the path specified: {abs_location}') if is_rar(location): raise ExtractErrorFailedToExtract( - 'RAR extraction disactivated: {}'.format(repr(location))) + f'RAR extraction deactivated: {location}') assert target_dir abs_target_dir = os.path.abspath(os.path.expanduser(target_dir)) if not os.path.exists(abs_target_dir): raise ExtractErrorFailedToExtract( - 'The system cannot find the target path specified: {}'.format(repr(target_dir))) + f'The system cannot find the target path specified: {target_dir}') + + if file_by_file: + extractor = extract_file_by_file + else: + extractor = extract_all_files_at_once - extractor = extract_file_by_file if file_by_file else extract_all_files_at_once return extractor( location=abs_location, target_dir=abs_target_dir, @@ -236,16 +247,20 @@ def extract(location, target_dir, arch_type='*', file_by_file=on_mac, skip_symli ) -def extract_all_files_at_once(location, target_dir, arch_type='*', skip_symlinks=True): +def extract_all_files_at_once( + location, + target_dir, + arch_type='*', + skip_symlinks=True, +): """ - Extract all files from a 7zip-supported archive file at `location` in the - `target_dir` directory. + Extract all files from a 7zip-supported archive file at ``location`` in the + ``target_dir`` directory. - Return a list of warning messages. - Raise exception on errors. + Return a list of warning messages. Raise exception on errors. - `arch_type` is the type of 7zip archive passed to the -t 7zip option. Can be - None. 
+ ``arch_type`` is the type of 7zip archive passed to the -t 7zip option. Can + be None. """ abs_location = os.path.abspath(os.path.expanduser(location)) abs_target_dir = os.path.abspath(os.path.expanduser(target_dir)) @@ -270,13 +285,18 @@ def extract_all_files_at_once(location, target_dir, arch_type='*', skip_symlinks return convert_warnings_to_list(get_7z_warnings(stdout)) -def build_7z_extract_command(location, target_dir, single_entry=None, arch_type='*'): +def build_7z_extract_command( + location, + target_dir, + single_entry=None, + arch_type='*', +): """ Return a mapping of 7z command line aguments to extract the archive at - `location` to `target_dir`. + ``location`` to ``target_dir``. - If `single_entry` contains an Entry, provide the command to extract only - that single entry "path" in the current directory without any leading path. + If ``single_entry`` contains an Entry, return the command to extract only + this single entry "path" in the current directory without any leading path. """ # 7z arguments @@ -297,7 +317,8 @@ def build_7z_extract_command(location, target_dir, single_entry=None, arch_type= # pass an empty password so that extraction with passwords WILL fail password = '-p' - # renaming may not behave the same way on all OSes in particular Mac and Windows + # renaming may not behave the same way on all OSes in particular Mac and + # Windows auto_rename_dupe_names = '-aou' # Ensure that we treat the FS as case insensitive if that's what it is @@ -316,13 +337,14 @@ def build_7z_extract_command(location, target_dir, single_entry=None, arch_type= # output_as_utf = '-sccUTF-8' # working_tmp_dir = '-w' - # NB: we force running in the GMT timezone, because 7z is unable to set - # the TZ correctly when the archive does not contain TZ info. This does - # not work on Windows, because 7z is not using the TZ env var there. 
+ # NB: we force running in the GMT timezone, because 7z is unable to set the + # TZ correctly when the archive does not contain TZ info. This does not work + # on Windows, because 7z is not using the TZ env var there. timezone = dict(os.environ) timezone.update({u'TZ': u'GMT'}) timezone = command.get_env(timezone) - # Note: 7z does extract in the current directory so we cwd to the target dir first + # Note: 7z does extract in the current directory so we cwd to the target dir + # first args = [ extract, yes_to_all, @@ -353,15 +375,20 @@ def build_7z_extract_command(location, target_dir, single_entry=None, arch_type= return ex_args -def extract_file_by_file(location, target_dir, arch_type='*', skip_symlinks=True): +def extract_file_by_file( + location, + target_dir, + arch_type='*', + skip_symlinks=True, +): """ Extract all files using a one-by-one process from a 7zip-supported archive - file at location in the `target_dir` directory. + file at ``location`` in the ``target_dir`` directory. Return a list of warning messages if any or an empty list. Raise exception on errors. - `arch_type` is the type of 7zip archive passed to the -t 7zip option. + ``arch_type`` is the type of 7zip archive passed to the -t 7zip option. Can be None. """ abs_location = os.path.abspath(os.path.expanduser(location)) @@ -429,7 +456,8 @@ def extract_file_by_file(location, target_dir, arch_type='*', skip_symlinks=True else: warnings[entry.path] = wmsg - # finally move that extracted file to its target location, possibly renamed + # finally move that extracted file to its target location, possibly + # renamed source_file_name = fileutils.file_name(entry.path) source_file_loc = os.path.join(tmp_extract_dir, source_file_name) if not os.path.exists(source_file_loc): @@ -464,10 +492,10 @@ def extract_file_by_file(location, target_dir, arch_type='*', skip_symlinks=True def list_entries(location, arch_type='*'): """ - Return a tuple of (iterator of Entry, error_messages). 
The generator contains - each entry found in a 7zip-supported archive file at `location`. Use the - provided 7zip `arch_type` CLI archive type code (e.g. with the "-t* 7z" cli - type option) (can be None). + Return a tuple of (iterator of Entry, error_messages). The generator + contains each entry found in a 7zip-supported archive file at `location`. + Use the provided 7zip `arch_type` CLI archive type code (e.g. with the "-t* + 7z" cli type option) (can be None). """ assert location abs_location = os.path.abspath(os.path.expanduser(location)) @@ -492,9 +520,9 @@ def list_entries(location, arch_type='*'): if on_windows: output_as_utf = '-sccUTF-8' - # NB: we force running in the GMT timezone, because 7z is unable to set - # the TZ correctly when the archive does not contain TZ info. This does - # not work on Windows, because 7z is not using the TZ env var there. + # NB: we force running in the GMT timezone, because 7z is unable to set the + # TZ correctly when the archive does not contain TZ info. This does not work + # on Windows, because 7z is not using the TZ env var there. timezone = dict(os.environ) timezone.update({u'TZ': u'GMT'}) timezone = command.get_env(timezone) @@ -527,48 +555,46 @@ def list_entries(location, arch_type='*'): if rc != 0: error_messages = get_7z_errors(stdout, stderr) or UNKNOWN_ERROR - # the listing was produced as UTF on windows to avoid damaging binary - # paths in console outputs - utf = bool(output_as_utf) + return parse_7z_listing(stdout), error_messages - return parse_7z_listing(stdout, utf), error_messages - -def parse_7z_listing(location, utf=False): +def parse_7z_listing(location): """ Return a list Entry objects from parsing a long format 7zip listing from a file at `location`. - If `utf` is True or if on Python 3, the console output will treated as - utf-8-encoded text. Otherwise it is treated as bytes. - The 7zip -slt format looks like this: 1. 
a header with: + ----------------- - copyright and version details - '--' line - - archive header info, varying based on the archive types and subtype - - lines of key=value pairs - - ERRORS: followed by one or more message lines - - WARNINGS: followed by one or more message lines + - archive header info, varying based on the archive types and subtype + - lines of key=value pairs + - ERRORS: followed by one or more message lines + - WARNINGS: followed by one or more message lines - blank line 2. blocks of path aka. entry data, one for each path with: + ---------------------------------------------------------- - '----------' line once as the indicator of path blocks starting - for each archive member: - lines of either - - key = value pairs, with a possible twist that the Path may - contain a line return since a filename may. The first key is the Path. - - Errors: followed by one or more message lines - - Warnings: followed by one or more message lines - - Open Warning: : followed by one or more message lines + - key = value pairs, with a possible twist that the Path may + contain a line return since a filename may. The first key is the + Path. + - Errors: followed by one or more message lines + - Warnings: followed by one or more message lines + - Open Warning: : followed by one or more message lines - blank line 3. a footer + ----------- + - blank line - - footer sometimes with lines with summary stats - such as Warnings: 1 Errors: 1 + - footer sometimes with lines with summary stats such as: + Warnings: 1 Errors: 1 - a line with two or more dashes or an empty line We ignore the header and footer in a listing. 
@@ -602,7 +628,10 @@ def parse_7z_listing(location, utf=False): # then we have a global footer two_empty_lines = '\n\n' path_key = 'Path' - path_blocks = [pb for pb in paths.split(two_empty_lines) if pb and path_key in pb] + path_blocks = [ + pb for pb in paths.split(two_empty_lines) + if pb and path_key in pb + ] key_value_sep = '=' @@ -610,12 +639,19 @@ def parse_7z_listing(location, utf=False): for path_block in path_blocks: # we ignore empty lines as well as lines that do not contain a key - lines = [line.strip() for line in path_block.splitlines(False) if line.strip()] + lines = [ + line.strip() for line in path_block.splitlines(False) + if line.strip() + ] if not lines: continue # we have a weird case of path with line returns in the file name # we concatenate these in the first Path line - while len(lines) > 1 and lines[0].startswith(path_key) and key_value_sep not in lines[1]: + while ( + len(lines) > 1 + and lines[0].startswith(path_key) + and key_value_sep not in lines[1] + ): first_line = lines[0] second_line = lines.pop(1) first_line = '\n'.join([first_line, second_line]) @@ -624,7 +660,10 @@ def parse_7z_listing(location, utf=False): dangling_lines = [line for line in lines if key_value_sep not in line] entry_errors = [] if dangling_lines: - emsg = 'Invalid 7z listing path block missing "=" as key/value separator: {}'.format(repr(path_block)) + emsg = ( + 'Invalid 7z listing path block missing "=" as key/value ' + 'separator: {}'.format(repr(path_block)) + ) entry_errors.append(emsg) entry_attributes = {} @@ -635,7 +674,8 @@ def parse_7z_listing(location, utf=False): v = v.strip() entry_attributes[k] = v - entries.append(Entry.from_dict(infos=entry_attributes, errors=entry_errors)) + ntry = Entry.from_dict(infos=entry_attributes, errors=entry_errors) + entries.append(ntry) if TRACE_ENTRIES: logger.debug('parse_7z_listing: entries# {}\n'.format(len(entries))) @@ -684,7 +724,8 @@ def is_empty(self): @classmethod def from_dict(cls, infos, errors=None): 
""" - Return an Entry built from a 7zip path listing data in the `infos` mapping. + Return an Entry built from a 7zip path listing data in the `infos` + mapping. """ is_symlink = False is_hardlink = False diff --git a/src/extractcode/uncompress.py b/src/extractcode/uncompress.py index 62da56c..143e336 100644 --- a/src/extractcode/uncompress.py +++ b/src/extractcode/uncompress.py @@ -31,6 +31,7 @@ def uncompress(location, target_dir, decompressor, suffix=EXTRACT_SUFFIX): Uncompress a compressed file at location in the target_dir using the `decompressor` object. The uncompressed file is named after the original archive with a `suffix` added. + Return a list of warning messages. Raise Exceptions on errors. """ # FIXME: do not create a sub-directory and instead strip the "compression" @@ -111,5 +112,11 @@ def get_compressed_file_content(location, decompressor): return content, warnings -get_gz_compressed_file_content = partial(get_compressed_file_content, decompressor=gzip.GzipFile) -get_bz2_compressed_file_content = partial(get_compressed_file_content, decompressor=bz2.BZ2File) +get_gz_compressed_file_content = partial( + get_compressed_file_content, + decompressor=gzip.GzipFile, +) +get_bz2_compressed_file_content = partial( + get_compressed_file_content, + decompressor=bz2.BZ2File, +) diff --git a/src/extractcode/vmimage.py b/src/extractcode/vmimage.py index 24c2bbf..2ce0ea7 100644 --- a/src/extractcode/vmimage.py +++ b/src/extractcode/vmimage.py @@ -23,8 +23,10 @@ """ Support to extract Virtual Machine image formats and the filesystem(s) they -contain. This is based on libguestfs-tools and is tested only on Linux. -Works only if libguestfs tool guestfish is in the path. +contain. This is based on libguestfs-tools and is tested only on Linux. Works +only if libguestfs tool guestfish: +- has its path in the "EXTRACTCODE_GUESTFISH_PATH" environment variable. +- or is in the system PATH. 
See https://libguestfs.org/ """ @@ -41,7 +43,7 @@ GUESTFISH_NOT_FOUND = ( 'WARNING: guestfish executable is not installed. ' 'Unable to extract virtual machine image: you need to install the ' - 'guestfish tool from libguestfs and extra FS drivers if needed. ' + 'guestfish tool from libguestfs and extra FS drivers as needed. ' 'See the ExtractCode README.rst at ' 'https://github.com/nexB/extractcode/blob/main/README.rst ' 'and https://libguestfs.org/ for details.' @@ -50,8 +52,7 @@ GUESTFISH_KERNEL_NOT_READABLE = ( '''libguestfs requires the kernel executable to be readable. This is the case by default on most Linux distributions except on Ubuntu. -Please follow the instructions in ExtractCode installation guide to make this happen. -See deatils in the ExtractCode README.rst at: +Please follow the ExtractCode installation instructions in the README.rst at: https://github.com/nexB/extractcode/blob/main/README.rst ' ''') @@ -69,6 +70,7 @@ def get_command(env_var=EXTRACTCODE_GUESTFISH_PATH_ENVVAR, command='guestfish'): cmd_loc = shutil.which(command) or None if not cmd_loc: warnings.warn(GUESTFISH_NOT_FOUND) + return cmd_loc @@ -78,9 +80,9 @@ def check_linux_kernel_is_readable(): guestfish and libguestfs and this is an oddity mostly on Ubuntu. 
See: - - https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725 - - https://bugzilla.redhat.com/show_bug.cgi?id=1670790 - - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662 + - https://bugs.launchpad.net/ubuntu/+source/linux/+bug/759725 + - https://bugzilla.redhat.com/show_bug.cgi?id=1670790 + - https://bugs.launchpad.net/ubuntu/+source/libguestfs/+bug/1813662 """ if on_linux: @@ -90,7 +92,9 @@ def check_linux_kernel_is_readable(): for kern in kernels: if not os.access(kern, os.R_OK): raise ExtractErrorFailedToExtract( - f'Unable to read kernel at: {kern}.\n{GUESTFISH_KERNEL_NOT_READABLE}') + f'Unable to read kernel at: {kern}.\n' + f'{GUESTFISH_KERNEL_NOT_READABLE}' + ) @attr.s @@ -106,7 +110,8 @@ def from_file(cls, location): Raise excptions on errors. """ if not on_linux: - raise ExtractErrorFailedToExtract('VM Image extraction only supported on Linux.') + raise ExtractErrorFailedToExtract( + 'VM Image extraction only supported on Linux.') check_linux_kernel_is_readable() @@ -130,7 +135,8 @@ def from_file(cls, location): image_format = supported_gfs_formats_by_extension.get(extension) if not image_format: - raise ExtractErrorFailedToExtract(f'Unsupported VM image format: {location}') + raise ExtractErrorFailedToExtract( + f'Unsupported VM image format: {location}') cmd_loc = get_command() if not cmd_loc: @@ -144,8 +150,10 @@ def from_file(cls, location): def listfs(self, skip_partitions=('swap',)): """ - Return a list of (filesystem /partition/ device path, filesystem type) for each - filesystem found in this image . + Return a list of (filesystem /partition/ device path, filesystem type) + for each filesystem found in this image. + + Skip the partitions names list ``skip_partitions``. 
We run guestfish for this: $ guestfish --ro add foo.qcow2 : run : list-filesystems @@ -182,8 +190,8 @@ def listfs(self, skip_partitions=('swap',)): def extract_image(self, target_tarball): """ - Extract all files from this VM image in the `target_tarball` file as a - gzipped-compressed tarball (.tar.gz). Raise exception on errors. + Extract all files from this VM image to the ``target_tarball`` file as a + gzipped-compressed tarball (.tar.gz). Raise Exception on errors. """ args = [ '--ro', @@ -197,8 +205,8 @@ def extract_image(self, target_tarball): def extract_partition(self, partition, target_tarball): """ - Extract all files from a single partition of this VM image to the - `target_tarball` file as a gzipped-compressed tarball (.tar.gz). Raise + Extract all files from a single ``partition`` of this VM image to the + ``target_tarball`` file as a gzipped-compressed tarball (.tar.gz). Raise exception on errors. """ # TODO: there could be devices/partitions we do not want to extract? @@ -216,8 +224,8 @@ def extract_partition(self, partition, target_tarball): def run_guestfish(self, args, timeout=None): """ - Run guestfish with `args` arguments. - Return stdout as unicode string. Raise exception on error + Run guestfish with ``args`` arguments and an optional ``timeout`` in + seconds. Return stdout as a unicode string. Raise Exception on error. """ import subprocess full_args = [self.guestfish_command] + args @@ -242,15 +250,15 @@ def run_guestfish(self, args, timeout=None): def extract(location, target_dir, as_tarballs=False, skip_symlinks=True): """ Extract all files from a guestfish-supported VM image archive file at - location in the target_dir directory. + ``location`` in the ``target_dir`` directory. Return a list of warning messages if any or an empty list. + Raise Exception on errors. - Optionally only extract the intermediate tarballs if `as_tarball` is True. + Optionally only extract the intermediate tarballs if ``as_tarballs`` is True.
Otherwise, extract to intermediate tarballs and then extract each tarballs to the final directory. - Optionally skip extracting symlinks. - Raise exception on errors. + Optionally skip extracting symlinks with ``skip_symlinks``. This works only on Linux. """ @@ -258,7 +266,8 @@ def extract(location, target_dir, as_tarballs=False, skip_symlinks=True): abs_target_dir = os.path.abspath(os.path.expanduser(target_dir)) if not os.path.exists(abs_target_dir) or not os.path.isdir(abs_target_dir): raise ExtractErrorFailedToExtract( - f'The system cannot find the target directory path specified: {target_dir}') + f'The system cannot find the target directory path ' + f'specified: {target_dir}') vmimage = VmImage.from_file(location) @@ -270,7 +279,8 @@ def extract(location, target_dir, as_tarballs=False, skip_symlinks=True): try: if not as_tarballs: - intermediate_dir = fileutils.get_temp_dir(prefix='extractcode-vmimage') + intermediate_dir = fileutils.get_temp_dir( + prefix='extractcode-vmimage') tdir = intermediate_dir else: tdir = target_dir @@ -290,7 +300,8 @@ def extract(location, target_dir, as_tarballs=False, skip_symlinks=True): except ExtractErrorFailedToExtract as e: print('Cannot extract VM Image filesystems as a single file tree.') - warnings.append(f'Cannot extract VM Image filesystems as a single file tree:\n{e}') + warnings.append( + f'Cannot extract VM Image filesystems as a single file tree:\n{e}') # fall back to file system extraction, one partition at a time partitions = vmimage.listfs() if not partitions: @@ -305,7 +316,10 @@ def extract(location, target_dir, as_tarballs=False, skip_symlinks=True): tdir = target_dir target_tarball = os.path.join(tdir, f'{filename}.tar.gz') - vmimage.extract_partition(partition=partition, target_tarball=target_tarball) + vmimage.extract_partition( + partition=partition, + target_tarball=target_tarball, + ) if not as_tarballs: # extract the temp tarball to the final location @@ -327,8 +341,14 @@ def extract(location, 
target_dir, as_tarballs=False, skip_symlinks=True): else: tdir = target_dir - partition_tarball = os.path.join(tdir, f'{filename}-{base_name}.tar.gz') - vmimage.extract_partition(partition=partition, target_tarball=partition_tarball) + partition_tarball = os.path.join( + tdir, + f'{filename}-{base_name}.tar.gz', + ) + vmimage.extract_partition( + partition=partition, + target_tarball=partition_tarball, + ) if not as_tarballs: # extract the temp tarball to the final location @@ -345,10 +365,11 @@ def extract(location, target_dir, as_tarballs=False, skip_symlinks=True): return warnings -def extract_image_tarball(tarball, target_dir, skip_symlinks=False): +def extract_image_tarball(tarball, target_dir, skip_symlinks=True): """ - Extract an intermediate image tarball to its final directory. - Return a list of warning messages + Extract an intermediate image ``tarball`` to its final ``target_dir`` + directory. Optionally skip extracting symlinks with ``skip_symlinks``. + Return a list of warning messages. Raise Exception on errors. """ from extractcode.libarchive2 import extract return extract( diff --git a/tests/extractcode_assert_utils.py b/tests/extractcode_assert_utils.py index 756ecc3..d930d2e 100644 --- a/tests/extractcode_assert_utils.py +++ b/tests/extractcode_assert_utils.py @@ -104,11 +104,11 @@ def check_no_error(result): def is_posixpath(location): """ - Return True if the `location` path is likely a POSIX-like path using POSIX path - separators (slash or "/")or has no path separator. + Return True if the `location` path is likely a POSIX-like path using POSIX + path separators (slash or "/")or has no path separator. - Return False if the `location` path is likely a Windows-like path using backslash - as path separators (e.g. "\"). + Return False if the `location` path is likely a Windows-like path using + backslash as path separators (e.g. "\"). 
""" has_slashes = '/' in location has_backslashes = '\\' in location @@ -128,10 +128,11 @@ def is_posixpath(location): def to_posix(path): """ - Return a path using the posix path separator given a path that may contain posix - or windows separators, converting \\ to /. NB: this path will still be valid in - the windows explorer (except as a UNC or share name). It will be a valid path - everywhere in Python. It will not be valid for windows command line operations. + Return a path using the posix path separator given a path that may contain + posix or windows separators, converting \\ to /. NB: this path will still be + valid in the windows explorer (except as a UNC or share name). It will be a + valid path everywhere in Python. It will not be valid for windows command + line operations. """ is_unicode = isinstance(path, str) ntpath_sep = is_unicode and u'\\' or '\\' @@ -181,7 +182,14 @@ def assertRaisesInstance(self, excInstance, callableObj, *args, **kwargs): excName = str(excClass) raise self.failureException('%s not raised' % excName) - def check_extract(self, test_function, test_file, expected, expected_warnings=None, check_all=False): + def check_extract( + self, + test_function, + test_file, + expected, + expected_warnings=None, + check_all=False, + ): """ Run the extraction `test_function` on `test_file` checking that a map of expected paths --> size exist in the extracted target directory. 
@@ -197,8 +205,14 @@ def check_extract(self, test_function, test_file, expected, expected_warnings=No if check_all: len_test_dir = len(test_dir) - extracted = {path[len_test_dir:]: filetype.get_size(path) for path in fileutils.resource_iter(test_dir, with_dirs=False)} - expected = {os.path.join(test_dir, exp_path): exp_size for exp_path, exp_size in expected.items()} + extracted = { + path[len_test_dir:]: filetype.get_size(path) + for path in fileutils.resource_iter(test_dir, with_dirs=False) + } + expected = { + os.path.join(test_dir, exp_path): exp_size + for exp_path, exp_size in expected.items() + } assert sorted(expected.items()) == sorted(extracted.items()) else: for exp_path, exp_size in expected.items(): @@ -236,9 +250,10 @@ def assertExceptionContains(self, text, callableObj, *args, **kwargs): except Exception as e: if text not in str(e): raise self.failureException( - 'Exception %(e)r raised, ' - 'it should contain the text %(text)r ' - 'and does not' % locals()) + 'Exception %(e)r raised, ' + 'it should contain the text %(text)r ' + 'and does not' % locals() + ) else: raise self.failureException( 'Exception containing %(text)r not raised' % locals()) diff --git a/tests/test_archive.py b/tests/test_archive.py index 3a731ae..3d2eaca 100644 --- a/tests/test_archive.py +++ b/tests/test_archive.py @@ -2212,12 +2212,16 @@ def test_uncompress_lzip_basic(self): class ExtractArchiveWithIllegalFilenamesTestCase(BaseArchiveTestCase): def check_extract_weird_names( - self, test_function, test_file, expected_suffix, - expected_warnings=None, - expected_exception=None, - check_warnings=True, check_only_warnings=False, - regen=False, - ): + self, + test_function, + test_file, + expected_suffix, + expected_warnings=None, + expected_exception=None, + check_warnings=True, + check_only_warnings=False, + regen=False, + ): """ Run the extraction `test_function` on `test_file` checking that the paths listed in the `test_file.excepted` file exist in the extracted target diff 
--git a/tests/test_extract.py b/tests/test_extract.py index a0a3475..facba67 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -18,7 +18,6 @@ from commoncode.fileutils import as_posixpath from commoncode.system import on_linux from commoncode.system import on_windows -from commoncode.testcase import FileBasedTesting import extractcode from extractcode import extract diff --git a/tests/test_libarchive2.py b/tests/test_libarchive2.py index 6bd7728..0ea6cb3 100644 --- a/tests/test_libarchive2.py +++ b/tests/test_libarchive2.py @@ -36,12 +36,13 @@ def test_libarchive_extract_can_extract_to_relative_paths(self): project_tmp = join(project_root, 'tmp') fileutils.create_dir(project_tmp) project_root_abs = abspath(project_root) - test_src_dir = tempfile.mkdtemp(dir=project_tmp).replace(project_root_abs, '').strip('\\/') - test_tgt_dir = tempfile.mkdtemp(dir=project_tmp).replace(project_root_abs, '').strip('\\/') + test_src_dir = tempfile.mkdtemp( + dir=project_tmp).replace(project_root_abs, '').strip('\\/') + test_tgt_dir = tempfile.mkdtemp( + dir=project_tmp).replace(project_root_abs, '').strip('\\/') shutil.copy(test_file, test_src_dir) test_src_file = join(test_src_dir, 'basic.zip') result = list(extract(test_src_file, test_tgt_dir)) assert [] == result expected = ['c/a/a.txt', 'c/b/a.txt', 'c/c/a.txt'] check_files(test_tgt_dir, expected) - diff --git a/tests/test_sevenzip.py b/tests/test_sevenzip.py index d9ad11a..4a7637e 100644 --- a/tests/test_sevenzip.py +++ b/tests/test_sevenzip.py @@ -24,7 +24,13 @@ class TestSevenZip(FileBasedTesting): test_data_dir = os.path.join(os.path.dirname(__file__), 'data') - def check_results_with_expected_json(self, results, expected_loc, clean_dates=False, regen=False): + def check_results_with_expected_json( + self, + results, + expected_loc, + clean_dates=False, + regen=False, + ): if regen: with open(expected_loc, 'w') as ex: json.dump(results, ex, indent=2, separators=(',', ':')) @@ -170,7 +176,7 @@ class 
TestSevenParseListing(TestSevenZip): def check_parse_7z_listing(self, test_loc, regen=False): test_loc = self.get_test_loc(test_loc) - results = [e.to_dict(full=True) for e in sevenzip.parse_7z_listing(location=test_loc, utf=True)] + results = [e.to_dict(full=True) for e in sevenzip.parse_7z_listing(location=test_loc)] expected_loc = test_loc + '-expected.json' self.check_results_with_expected_json( results=results, expected_loc=expected_loc, regen=regen)