Skip to content

Commit

Permalink
Add remote file fetcher and N3 test suite
Browse files Browse the repository at this point in the history
This patch adds the N3 test suite from https://github.com/w3c/N3/tree/master/tests
and also adds `test/data/fetcher.py` which fetches remote test data.

Remotes are added for some data in the test data directory, more will be
added later and the data itself will be corrected.

I'm mainly doing this because I want N3 test data to test the fix I'm
making for these issues:
- RDFLib#1807
- RDFLib#1701

Related to:
- RDFLib#1840
  • Loading branch information
aucampia committed Apr 23, 2022
1 parent e4aae60 commit 02dd2bf
Show file tree
Hide file tree
Showing 1,831 changed files with 623,889 additions and 3 deletions.
26 changes: 23 additions & 3 deletions test/data/README.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,29 @@
# Consistent Test Data
# Test Data

This directory contains consistent graphs that can be used inside tests, the
graphs in this directory should not change.
This directory contains data for use inside tests, ideally the data in this
directory should be constant and should not change, and in general non-original
data that is widely known is preferred to original data as well known data has
well known attributes and qualities that can make it easier to reason about.


## File origins

- `rdfs.ttl`: `http://www.w3.org/2000/01/rdf-schema#`

## Fetcher

Files that originate from the internet should be downloaded using `fetcher.py`
so we can easily verify the integrity of the files by re-running `fetcher.py`.

```bash
# run in repo root

# fetch everything
.venv/bin/python3 test/data/fetcher.py

# only fetch single file
.venv/bin/python3 test/data/fetcher.py test/data/rdfs.ttl

# only fetch files below path:
.venv/bin/python3 test/data/fetcher.py test/data/suites
```
309 changes: 309 additions & 0 deletions test/data/fetcher.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,309 @@
#!/usr/bin/env python
import argparse
import enum
import logging
import os
import random
import re
import shutil
import string
import sys
import tarfile
from contextlib import ExitStack, contextmanager
from dataclasses import dataclass, field
from pathlib import Path
from tarfile import TarFile, TarInfo
from tempfile import TemporaryDirectory, mkdtemp
from typing import IO, Generator, List, Pattern, Union
from urllib.request import Request, urlopen
from zipfile import ZipFile, ZipInfo

DATA_PATH = Path(__file__).parent


@dataclass
class Resource:
remote: Union[str, Request]
local_path: Path

def fetch(self, tmp_path: Path) -> None:
raise NotImplementedError()


@dataclass
class FileResource(Resource):
def fetch(self, tmp_path: Path) -> None:
if self.local_path.exists():
logging.debug("info %s", self.local_path)
os.remove(self.local_path)

with ExitStack() as xstack:
request = (
self.remote
if isinstance(self.remote, Request)
else Request(self.remote)
)
response = urlopen(request)
remote_io: IO[bytes] = xstack.enter_context(response)

local_io = xstack.enter_context(self.local_path.open("wb+"))
shutil.copyfileobj(remote_io, local_io)

logging.info("Downloaded %s to %s", request.full_url, self.local_path)


class ArchiveType(enum.Enum):
ZIP = "zip"
TAR_GZ = "tar.gz"


@dataclass
class ArchiveResource(Resource):
type: ArchiveType
pattern: Pattern[str]

def fetch(self, tmp_path: Path) -> None:
if self.local_path.exists():
logging.debug("info %s", self.local_path)
shutil.rmtree(self.local_path)
with ExitStack() as xstack:
request = (
self.remote
if isinstance(self.remote, Request)
else Request(self.remote)
)
response = urlopen(request)
remote_io: IO[bytes] = xstack.enter_context(response)
name = (
"".join(
random.choices(
string.ascii_uppercase + string.digits + string.ascii_lowercase,
k=10,
)
)
+ f".{self.type.value}"
)
tmp_file = tmp_path / name
logging.info("fetching %s to temp file %s", self.remote, tmp_file)
with tmp_file.open("wb+") as tmp_io:
shutil.copyfileobj(remote_io, tmp_io)

archive_file: Union[ZipFile, TarFile]
if self.type is ArchiveType.ZIP:
archive_file = xstack.enter_context(ZipFile(tmp_file))
elif self.type is ArchiveType.TAR_GZ:
archive_file = xstack.enter_context(tarfile.open(tmp_file, mode="r:gz"))
# archive_file = xstack.enter_context(TarFile(tmp_file, mode="r|gz"))
else:
raise ValueError(f"invalid type {self.type}")

for member_info in self._member_list(archive_file):
member_filename = self._member_filename(member_info)
if self._member_isdir(member_info):
logging.debug("Ignoring directory %s", member_filename)
continue

match = self.pattern.match(member_filename)
if match is None:
logging.debug("Ignoring unmatched %s", member_filename)
continue
groups = match.groups()
if len(groups) > 0:
dest_filename = groups[0]

member_io: IO[bytes]
with self._member_io(archive_file, member_info) as member_io:
local_file = self.local_path / dest_filename
if not local_file.parent.exists():
local_file.parent.mkdir(parents=True)
logging.debug("writing %s to %s", member_filename, local_file)
local_file.write_bytes(member_io.read())

logging.info(
"Downloaded %s and extracted files matching %s to %s",
request.full_url,
self.pattern,
self.local_path,
)

@classmethod
def _member_list(
cls, archive: Union[ZipFile, TarFile]
) -> Union[List[ZipInfo], List[TarInfo]]:
if isinstance(archive, ZipFile):
return archive.infolist()
return archive.getmembers()

@classmethod
def _member_isdir(cls, member_info: Union[ZipInfo, TarInfo]) -> bool:
if isinstance(member_info, ZipInfo):
return member_info.is_dir()
return member_info.isdir()

@classmethod
def _member_filename(cls, member_info: Union[ZipInfo, TarInfo]) -> str:
if isinstance(member_info, ZipInfo):
return member_info.filename
return member_info.name

@classmethod
@contextmanager
def _member_io(
cls, archive: Union[ZipFile, TarFile], member_info: Union[ZipInfo, TarInfo]
) -> Generator[IO[bytes], None, None]:
if isinstance(archive, ZipFile):
assert isinstance(member_info, ZipInfo)
with archive.open(member_info) as member_io:
yield member_io
else:
assert isinstance(member_info, TarInfo)
opt_io = archive.extractfile(member_info)
assert opt_io is not None
yield opt_io


RESOURCES: List[Resource] = [
ArchiveResource(
remote="https://github.com/w3c/N3/archive/c44d123c5958ca04117e28ca3769e2c0820f72e6.zip",
local_path=(DATA_PATH / "suites" / "w3c" / "n3"),
type=ArchiveType.ZIP,
pattern=re.compile(r"^[^\/]+[\/]tests[\/](.+)$"),
),
ArchiveResource(
remote="https://www.w3.org/2013/TurtleTests/TESTS.tar.gz",
local_path=(DATA_PATH / "suites" / "w3c" / "turtle"),
type=ArchiveType.TAR_GZ,
pattern=re.compile(r"^[^\/]+[\/](.+)$"),
),
ArchiveResource(
remote="https://www.w3.org/2013/N-QuadsTests/TESTS.tar.gz",
local_path=(DATA_PATH / "suites" / "w3c" / "nquads"),
type=ArchiveType.TAR_GZ,
pattern=re.compile(r"^(.+)$"),
),
ArchiveResource(
remote="https://www.w3.org/2013/N-TriplesTests/TESTS.tar.gz",
local_path=(DATA_PATH / "suites" / "w3c" / "ntriples"),
type=ArchiveType.TAR_GZ,
pattern=re.compile(r"^(.+)$"),
),
ArchiveResource(
remote="https://www.w3.org/2013/TrigTests/TESTS.tar.gz",
local_path=(DATA_PATH / "suites" / "w3c" / "trig"),
type=ArchiveType.TAR_GZ,
pattern=re.compile(r"^(.+)$"),
),
# NOTE: Commented out as these files contains local modifications.
# ArchiveResource(
# remote="https://www.w3.org/2013/RDFXMLTests/TESTS.zip",
# local_path=(DATA_PATH / "suites" / "w3c" / "rdfxml"),
# type=ArchiveType.ZIP,
# pattern=re.compile(r"^(.+)$"),
# ),
# NOTE: Commented out as this contains local modifications.
# ArchiveResource(
# remote="https://www.w3.org/2009/sparql/docs/tests/sparql11-test-suite-20121023.tar.gz",
# local_path=(DATA_PATH / "suites" / "DAWG" / "data-sparql11"),
# type=ArchiveType.TAR_GZ,
# pattern=re.compile(r"^[^\/]+[\/](.+)$"),
# ),
FileResource(
remote=Request(
"http://www.w3.org/2000/01/rdf-schema#", headers={"Accept": "text/turtle"}
),
local_path=(DATA_PATH / "rdfs.ttl"),
),
]


@dataclass
class Application:
parser: argparse.ArgumentParser = field(
default_factory=lambda: argparse.ArgumentParser(add_help=True)
)

def __post_init__(self) -> None:
parser = self.parser
parser.add_argument(
"-v",
"--verbose",
action="count",
dest="verbosity",
help="increase verbosity level",
)
parser.add_argument(
"--keep-tmp",
action="store_true",
default=False,
)
parser.add_argument("paths", nargs="*", type=str)
parser.set_defaults(handler=self.handle)

def run(self, args: List[str]) -> None:
parse_result = self.parser.parse_args(args)

verbosity = parse_result.verbosity
if verbosity is not None:
root_logger = logging.getLogger("")
root_logger.propagate = True
new_level = (
root_logger.getEffectiveLevel()
- (min(1, verbosity)) * 10
- min(max(0, verbosity - 1), 9) * 1
)
root_logger.setLevel(new_level)

logging.debug(
"args = %s, parse_result = %s, logging.level = %s",
args,
parse_result,
logging.getLogger("").getEffectiveLevel(),
)

parse_result.handler(parse_result)

def handle(self, parse_result: argparse.Namespace) -> None:
logging.debug("entry ...")

paths = {Path(path).absolute() for path in parse_result.paths}

logging.debug("paths = %s", paths)

if parse_result.keep_tmp:
tmp_path = Path(mkdtemp())
else:
tmp_dir = TemporaryDirectory()
tmp_path = Path(tmp_dir.name)

for resource in RESOURCES:
if paths:
include = False
for path in paths:
try:
resource.local_path.absolute().relative_to(path)
include = True
except ValueError:
# not relative to, ignoring
pass
if not include:
logging.info("skipping %s", resource.local_path)
continue
resource.fetch(tmp_path)


def main() -> None:
logging.basicConfig(
level=os.environ.get("PYLOGGING_LEVEL", logging.INFO),
stream=sys.stderr,
datefmt="%Y-%m-%dT%H:%M:%S",
format=(
"%(asctime)s.%(msecs)03d %(process)d %(thread)d %(levelno)03d:%(levelname)-8s "
"%(name)-12s %(module)s:%(lineno)s:%(funcName)s %(message)s"
),
)

Application().run(sys.argv[1:])


if __name__ == "__main__":
main()
4 changes: 4 additions & 0 deletions test/data/suites/w3c/n3/LICENSE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
All documents in this Repository are licensed by contributors
under both the the [W3C Test Suite License](http://www.w3.org/Consortium/Legal/2008/04-testsuite-license) and
[W3C Software and Document License](https://www.w3.org/Consortium/Legal/copyright-software).

Loading

0 comments on commit 02dd2bf

Please sign in to comment.