Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate Package parsing functions #3135

Merged
merged 5 commits into from
Oct 28, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,18 @@ License detection:
removed. This new command supports simpler reindexing using custom
license texts and license rules contributed by plugins or stored in an
additional directory.

Package detection:
~~~~~~~~~~~~~~~~~~~~~

- The code for parsing a Maven POM, npm package.json, FreeBSD manifest and
haxelib JSON has been separated into two functions: one that creates a
PackageData object from the parsed Resource, and another that calls the first
function and yields the PackageData. This was done so that we can use the
package manifest data parsing code outside of the scancode-toolkit context in
other libraries.


v31.2.1 - 2022-10-05
----------------------------------

Expand Down
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[metadata]
name = scancode-toolkit
version = 31.2.1
version = 31.2.2
license = Apache-2.0 AND CC-BY-4.0 AND LicenseRef-scancode-other-permissive AND LicenseRef-scancode-other-copyleft

# description must be on ONE line https://github.com/pypa/setuptools/issues/1390
Expand Down
2 changes: 1 addition & 1 deletion src/packagedcode/chef.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def chef_api_url(name, version):
return name and version and f'https://supermarket.chef.io/api/v1/cookbooks/{name}/versions/{version}'


def get_urls(name, version):
def get_urls(name, version, **kwargs):
"""
Return a mapping of URLs given a name and version.
"""
Expand Down
2 changes: 1 addition & 1 deletion src/packagedcode/cocoapods.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,7 +434,7 @@ def parse(cls, location):
)


def get_urls(name=None, version=None, homepage_url=None, vcs_url=None):
def get_urls(name=None, version=None, homepage_url=None, vcs_url=None, **kwargs):
"""
Return a mapping of podspec URLS.
"""
Expand Down
32 changes: 18 additions & 14 deletions src/packagedcode/freebsd.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,20 +42,13 @@ class CompactManifestHandler(models.DatafileHandler):
documentation_url = 'https://www.freebsd.org/cgi/man.cgi?pkg-create(8)#MANIFEST_FILE_DETAILS'

@classmethod
def parse(cls, location):
"""
Yield one or more Package manifest objects given a file ``location`` pointing to a
package archive, manifest or similar.
"""
with io.open(location, encoding='utf-8') as loc:
freebsd_manifest = saneyaml.load(loc)

def _parse(cls, yaml_data):
package_data = models.PackageData(
datasource_id=cls.datasource_id,
type=cls.default_package_type,
qualifiers=dict(
arch=freebsd_manifest.get('arch'),
origin=freebsd_manifest.get('origin'),
arch=yaml_data.get('arch'),
origin=yaml_data.get('origin'),
)
)

Expand All @@ -69,7 +62,7 @@ def parse(cls, location):
]

for source, target in plain_fields:
value = freebsd_manifest.get(source)
value = yaml_data.get(source)
if value:
if isinstance(value, str):
value = value.strip()
Expand All @@ -87,17 +80,28 @@ def parse(cls, location):

for source, func in field_mappers:
logger.debug('parse: %(source)r, %(func)r' % locals())
value = freebsd_manifest.get(source) or None
value = yaml_data.get(source) or None
if value:
func(value, package_data)

# license_mapper needs multiple fields
license_mapper(freebsd_manifest, package_data)
license_mapper(yaml_data, package_data)

if package_data.declared_license:
package_data.license_expression = cls.compute_normalized_license(package_data)

yield package_data
return package_data

@classmethod
def parse(cls, location):
    """
    Yield a single PackageData object built from the FreeBSD manifest file
    at ``location``.

    The file is read as UTF-8 text and loaded with ``saneyaml``; the loaded
    data is then passed to ``_parse()``, whose result is yielded.
    """
    with open(location, encoding='utf-8') as manifest_file:
        manifest = saneyaml.load(manifest_file)

    package_data = cls._parse(manifest)
    yield package_data

@classmethod
def compute_normalized_license(cls, package):
Expand Down
46 changes: 25 additions & 21 deletions src/packagedcode/haxe.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,26 +45,7 @@ class HaxelibJsonHandler(models.DatafileHandler):
documentation_url = 'https://lib.haxe.org/documentation/creating-a-haxelib-package/'

@classmethod
def parse(cls, location):
"""
Yield one or more Package manifest objects given a file ``location`` pointing to a
package_data archive, manifest or similar.

{
"name": "haxelib",
"url" : "https://lib.haxe.org/documentation/",
"license": "GPL",
"tags": ["haxelib", "core"],
"description": "The haxelib client",
"classPath": "src",
"version": "3.4.0",
"releasenote": " * Fix password input issue in Windows (#421).\n * ....",
"contributors": ["back2dos", "ncannasse", "jason", "Simn", "nadako", "andyli"]
}
"""
with io.open(location, encoding='utf-8') as loc:
json_data = json.load(loc)

def _parse(cls, json_data):
name = json_data.get('name')
version = json_data.get('version')

Expand Down Expand Up @@ -110,4 +91,27 @@ def parse(cls, location):
dep = models.DependentPackage(purl=dep_purl, is_resolved=is_resolved,)
package_data.dependencies.append(dep)

yield package_data
return package_data

@classmethod
def parse(cls, location):
    """
    Yield a single PackageData object built from the haxelib JSON file at
    ``location``.

    The file is read as UTF-8 text and decoded as JSON; the decoded data is
    then passed to ``_parse()``, whose result is yielded.

    Example of a haxelib.json file:

    {
        "name": "haxelib",
        "url" : "https://lib.haxe.org/documentation/",
        "license": "GPL",
        "tags": ["haxelib", "core"],
        "description": "The haxelib client",
        "classPath": "src",
        "version": "3.4.0",
        "releasenote": " * Fix password input issue in Windows (#421).\n * ....",
        "contributors": ["back2dos", "ncannasse", "jason", "Simn", "nadako", "andyli"]
    }
    """
    with open(location, encoding='utf-8') as haxelib_file:
        haxelib_json = json.load(haxelib_file)

    package_data = cls._parse(haxelib_json)
    yield package_data
34 changes: 29 additions & 5 deletions src/packagedcode/maven.py
Original file line number Diff line number Diff line change
Expand Up @@ -891,12 +891,12 @@ def has_basic_pom_attributes(pom):
return basics


def get_maven_pom(location=None):
def get_maven_pom(location=None, text=None):
"""
Return a MavenPom object from a POM file at `location` or provided as a
`text` string.
"""
pom = MavenPom(location=location)
pom = MavenPom(location=location, text=text)

extra_properties = {}

Expand Down Expand Up @@ -1043,7 +1043,7 @@ def get_parties(pom):
return parties


def get_urls(namespace, name, version, qualifiers, base_url='https://repo1.maven.org/maven2'):
def get_urls(namespace, name, version, qualifiers, base_url='https://repo1.maven.org/maven2', **kwargs):
"""
Return a mapping of URLs.
"""
Expand Down Expand Up @@ -1105,7 +1105,30 @@ def parse(
Yield Packagedata objects from parsing a Maven pom file at `location` or
using the provided `text` (one or the other but not both).
"""
pom = get_maven_pom(location=location)
package = _parse(
datasource_id=datasource_id,
package_type=package_type,
primary_language=primary_language,
location=location,
base_url=base_url
)
if package:
yield package


def _parse(
datasource_id,
package_type,
primary_language,
location=None,
text=None,
base_url='https://repo1.maven.org/maven2',
):
"""
Return a PackageData object from parsing a Maven pom file at `location` or
using the provided `text` (one or the other but not both).
Return None if no pom could be obtained.
"""
pom = get_maven_pom(location=location, text=text)

if not pom:
return
Expand Down Expand Up @@ -1192,7 +1215,8 @@ def parse(
if not package_data.license_expression and package_data.declared_license:
package_data.license_expression = models.compute_normalized_license(package_data.declared_license)

yield package_data
return package_data


def build_vcs_and_code_view_urls(scm):
"""
Expand Down
2 changes: 1 addition & 1 deletion src/packagedcode/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def set_purl(self, package_url):
for key, value in package_url.to_dict().items():
self_val = getattr(self, key)
if not self_val and value:
setattr(self, attr, value)
setattr(self, key, value)

def to_dict(self, **kwargs):
mapping = super().to_dict(**kwargs)
Expand Down
32 changes: 18 additions & 14 deletions src/packagedcode/npm.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,7 +146,7 @@ def walk_npm(cls, resource, codebase, depth=0):
yield subchild


def get_urls(namespace, name, version):
def get_urls(namespace, name, version, **kwargs):
return dict(
repository_homepage_url=npm_homepage_url(namespace, name, registry='https://www.npmjs.com/package'),
repository_download_url=npm_download_url(namespace, name, version, registry='https://registry.npmjs.org'),
Expand All @@ -163,13 +163,10 @@ class NpmPackageJsonHandler(BaseNpmHandler):
documentation_url = 'https://docs.npmjs.com/cli/v8/configuring-npm/package-json'

@classmethod
def parse(cls, location):
with io.open(location, encoding='utf-8') as loc:
package_data = json.load(loc)

name = package_data.get('name')
version = package_data.get('version')
homepage_url = package_data.get('homepage', '')
def _parse(cls, json_data):
name = json_data.get('name')
version = json_data.get('version')
homepage_url = json_data.get('homepage', '')

# a package.json without name and version can be a private package

Expand All @@ -188,11 +185,11 @@ def parse(cls, location):
namespace=namespace or None,
name=name,
version=version or None,
description=package_data.get('description', '').strip() or None,
description=json_data.get('description', '').strip() or None,
homepage_url=homepage_url,
**urls,
)
vcs_revision = package_data.get('gitHead') or None
vcs_revision = json_data.get('gitHead') or None

# mapping of top level package.json items to a function accepting as
# arguments the package.json element value and returning an iterable of (key,
Expand All @@ -214,7 +211,7 @@ def parse(cls, location):
]

for source, func in field_mappers:
value = package_data.get(source) or None
value = json_data.get(source) or None
if value:
if isinstance(value, str):
value = value.strip()
Expand All @@ -226,14 +223,21 @@ def parse(cls, location):
package.download_url = npm_download_url(package.namespace, package.name, package.version)

# licenses are a tad special with many different data structures
lic = package_data.get('license')
lics = package_data.get('licenses')
lic = json_data.get('license')
lics = json_data.get('licenses')
package = licenses_mapper(lic, lics, package)

if not package.license_expression and package.declared_license:
package.license_expression = compute_normalized_license(package.declared_license)

yield package
return package

@classmethod
def parse(cls, location):
    """
    Yield a single package data object built from the npm package.json file
    at ``location``.

    The file is read as UTF-8 text and decoded as JSON; the decoded data is
    then passed to ``_parse()``, whose result is yielded.
    """
    with open(location, encoding='utf-8') as package_json_file:
        package_json = json.load(package_json_file)

    package = cls._parse(package_json)
    yield package

@classmethod
def compute_normalized_license(cls, package):
Expand Down
2 changes: 1 addition & 1 deletion src/packagedcode/nuget.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# TODO: add dependencies


def get_urls(name, version):
def get_urls(name, version, **kwargs):
return dict(
repository_homepage_url=f'https://www.nuget.org/packages/{name}/{version}',
repository_download_url=f'https://www.nuget.org/api/v2/package/{name}/{version}',
Expand Down
2 changes: 1 addition & 1 deletion src/packagedcode/pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1609,7 +1609,7 @@ def get_setup_py_args(location, include_not_parsable=False):
return parse_setup_py(location)


def get_pypi_urls(name, version):
def get_pypi_urls(name, version, **kwargs):
"""
Return a mapping of computed Pypi URLs for this package
"""
Expand Down
2 changes: 1 addition & 1 deletion src/packagedcode/rubygems.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ def compute_normalized_license(declared_license):
return combine_expressions(detected_licenses)


def get_urls(name, version=None, platform=None):
def get_urls(name, version=None, platform=None, **kwargs):
"""
Return a mapping of standard URLs
"""
Expand Down