Add remote file fetcher and N3 test suite

This patch adds the N3 test suite from https://github.com/w3c/N3/tree/master/tests and also adds `test/data/fetcher.py` which fetches remote test data. Remotes are added for some data in the test data directory, more will be added later and the data itself will be corrected. I'm mainly doing this because I want N3 test data to test the fix I'm making for these issues: - RDFLib#1807 - RDFLib#1701 Related to: - RDFLib#1840
aucampia · Apr 23, 2022 · 02dd2bf · 02dd2bf
1 parent e4aae60
commit 02dd2bf
Show file tree

Hide file tree

Showing 1,831 changed files with 623,889 additions and 3 deletions.
diff --git a/test/data/README.md b/test/data/README.md
@@ -1,9 +1,29 @@
-# Consistent Test Data
+# Test Data
 
-This directory contains consistent graphs that can be used inside tests, the
-graphs in this directory should not change.
+This directory contains data for use inside tests, ideally the data in this
+directory should be constant and should not change, and in general non-original
+data that is widely known is preferred to original data as well known data has
+well known attributes and qualities that can make it easier to reason about.
 
 
 ## File origins
 
 - `rdfs.ttl`: `http://www.w3.org/2000/01/rdf-schema#`
+
+## Fetcher
+
+Files that originate from the internet should be downloaded using `fetcher.py`
+so we can easily verify the integrity of the files by re-running `fetcher.py`.
+
+```bash
+# run in repo root
+
+# fetch everything
+.venv/bin/python3 test/data/fetcher.py
+
+# only fetch single file
+.venv/bin/python3 test/data/fetcher.py test/data/rdfs.ttl
+
+# only fetch files below path:
+.venv/bin/python3 test/data/fetcher.py test/data/suites
+```
diff --git a/test/data/fetcher.py b/test/data/fetcher.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python
+import argparse
+import enum
+import logging
+import os
+import random
+import re
+import shutil
+import string
+import sys
+import tarfile
+from contextlib import ExitStack, contextmanager
+from dataclasses import dataclass, field
+from pathlib import Path
+from tarfile import TarFile, TarInfo
+from tempfile import TemporaryDirectory, mkdtemp
+from typing import IO, Generator, List, Pattern, Union
+from urllib.request import Request, urlopen
+from zipfile import ZipFile, ZipInfo
+
+DATA_PATH = Path(__file__).parent
+
+
+@dataclass
+class Resource:
+    remote: Union[str, Request]
+    local_path: Path
+
+    def fetch(self, tmp_path: Path) -> None:
+        raise NotImplementedError()
+
+
+@dataclass
+class FileResource(Resource):
+    def fetch(self, tmp_path: Path) -> None:
+        if self.local_path.exists():
+            logging.debug("info %s", self.local_path)
+            os.remove(self.local_path)
+
+        with ExitStack() as xstack:
+            request = (
+                self.remote
+                if isinstance(self.remote, Request)
+                else Request(self.remote)
+            )
+            response = urlopen(request)
+            remote_io: IO[bytes] = xstack.enter_context(response)
+
+            local_io = xstack.enter_context(self.local_path.open("wb+"))
+            shutil.copyfileobj(remote_io, local_io)
+
+        logging.info("Downloaded %s to %s", request.full_url, self.local_path)
+
+
+class ArchiveType(enum.Enum):
+    ZIP = "zip"
+    TAR_GZ = "tar.gz"
+
+
+@dataclass
+class ArchiveResource(Resource):
+    type: ArchiveType
+    pattern: Pattern[str]
+
+    def fetch(self, tmp_path: Path) -> None:
+        if self.local_path.exists():
+            logging.debug("info %s", self.local_path)
+            shutil.rmtree(self.local_path)
+        with ExitStack() as xstack:
+            request = (
+                self.remote
+                if isinstance(self.remote, Request)
+                else Request(self.remote)
+            )
+            response = urlopen(request)
+            remote_io: IO[bytes] = xstack.enter_context(response)
+            name = (
+                "".join(
+                    random.choices(
+                        string.ascii_uppercase + string.digits + string.ascii_lowercase,
+                        k=10,
+                    )
+                )
+                + f".{self.type.value}"
+            )
+            tmp_file = tmp_path / name
+            logging.info("fetching %s to temp file %s", self.remote, tmp_file)
+            with tmp_file.open("wb+") as tmp_io:
+                shutil.copyfileobj(remote_io, tmp_io)
+
+            archive_file: Union[ZipFile, TarFile]
+            if self.type is ArchiveType.ZIP:
+                archive_file = xstack.enter_context(ZipFile(tmp_file))
+            elif self.type is ArchiveType.TAR_GZ:
+                archive_file = xstack.enter_context(tarfile.open(tmp_file, mode="r:gz"))
+                # archive_file = xstack.enter_context(TarFile(tmp_file, mode="r|gz"))
+            else:
+                raise ValueError(f"invalid type {self.type}")
+
+            for member_info in self._member_list(archive_file):
+                member_filename = self._member_filename(member_info)
+                if self._member_isdir(member_info):
+                    logging.debug("Ignoring directory %s", member_filename)
+                    continue
+
+                match = self.pattern.match(member_filename)
+                if match is None:
+                    logging.debug("Ignoring unmatched %s", member_filename)
+                    continue
+                groups = match.groups()
+                if len(groups) > 0:
+                    dest_filename = groups[0]
+
+                member_io: IO[bytes]
+                with self._member_io(archive_file, member_info) as member_io:
+                    local_file = self.local_path / dest_filename
+                    if not local_file.parent.exists():
+                        local_file.parent.mkdir(parents=True)
+                    logging.debug("writing %s to %s", member_filename, local_file)
+                    local_file.write_bytes(member_io.read())
+
+        logging.info(
+            "Downloaded %s and extracted files matching %s to %s",
+            request.full_url,
+            self.pattern,
+            self.local_path,
+        )
+
+    @classmethod
+    def _member_list(
+        cls, archive: Union[ZipFile, TarFile]
+    ) -> Union[List[ZipInfo], List[TarInfo]]:
+        if isinstance(archive, ZipFile):
+            return archive.infolist()
+        return archive.getmembers()
+
+    @classmethod
+    def _member_isdir(cls, member_info: Union[ZipInfo, TarInfo]) -> bool:
+        if isinstance(member_info, ZipInfo):
+            return member_info.is_dir()
+        return member_info.isdir()
+
+    @classmethod
+    def _member_filename(cls, member_info: Union[ZipInfo, TarInfo]) -> str:
+        if isinstance(member_info, ZipInfo):
+            return member_info.filename
+        return member_info.name
+
+    @classmethod
+    @contextmanager
+    def _member_io(
+        cls, archive: Union[ZipFile, TarFile], member_info: Union[ZipInfo, TarInfo]
+    ) -> Generator[IO[bytes], None, None]:
+        if isinstance(archive, ZipFile):
+            assert isinstance(member_info, ZipInfo)
+            with archive.open(member_info) as member_io:
+                yield member_io
+        else:
+            assert isinstance(member_info, TarInfo)
+            opt_io = archive.extractfile(member_info)
+            assert opt_io is not None
+            yield opt_io
+
+
+RESOURCES: List[Resource] = [
+    ArchiveResource(
+        remote="https://github.com/w3c/N3/archive/c44d123c5958ca04117e28ca3769e2c0820f72e6.zip",
+        local_path=(DATA_PATH / "suites" / "w3c" / "n3"),
+        type=ArchiveType.ZIP,
+        pattern=re.compile(r"^[^\/]+[\/]tests[\/](.+)$"),
+    ),
+    ArchiveResource(
+        remote="https://www.w3.org/2013/TurtleTests/TESTS.tar.gz",
+        local_path=(DATA_PATH / "suites" / "w3c" / "turtle"),
+        type=ArchiveType.TAR_GZ,
+        pattern=re.compile(r"^[^\/]+[\/](.+)$"),
+    ),
+    ArchiveResource(
+        remote="https://www.w3.org/2013/N-QuadsTests/TESTS.tar.gz",
+        local_path=(DATA_PATH / "suites" / "w3c" / "nquads"),
+        type=ArchiveType.TAR_GZ,
+        pattern=re.compile(r"^(.+)$"),
+    ),
+    ArchiveResource(
+        remote="https://www.w3.org/2013/N-TriplesTests/TESTS.tar.gz",
+        local_path=(DATA_PATH / "suites" / "w3c" / "ntriples"),
+        type=ArchiveType.TAR_GZ,
+        pattern=re.compile(r"^(.+)$"),
+    ),
+    ArchiveResource(
+        remote="https://www.w3.org/2013/TrigTests/TESTS.tar.gz",
+        local_path=(DATA_PATH / "suites" / "w3c" / "trig"),
+        type=ArchiveType.TAR_GZ,
+        pattern=re.compile(r"^(.+)$"),
+    ),
+    # NOTE: Commented out as these files contains local modifications.
+    # ArchiveResource(
+    #     remote="https://www.w3.org/2013/RDFXMLTests/TESTS.zip",
+    #     local_path=(DATA_PATH / "suites" / "w3c" / "rdfxml"),
+    #     type=ArchiveType.ZIP,
+    #     pattern=re.compile(r"^(.+)$"),
+    # ),
+    # NOTE: Commented out as this contains local modifications.
+    # ArchiveResource(
+    #     remote="https://www.w3.org/2009/sparql/docs/tests/sparql11-test-suite-20121023.tar.gz",
+    #     local_path=(DATA_PATH / "suites" / "DAWG" / "data-sparql11"),
+    #     type=ArchiveType.TAR_GZ,
+    #     pattern=re.compile(r"^[^\/]+[\/](.+)$"),
+    # ),
+    FileResource(
+        remote=Request(
+            "http://www.w3.org/2000/01/rdf-schema#", headers={"Accept": "text/turtle"}
+        ),
+        local_path=(DATA_PATH / "rdfs.ttl"),
+    ),
+]
+
+
+@dataclass
+class Application:
+    parser: argparse.ArgumentParser = field(
+        default_factory=lambda: argparse.ArgumentParser(add_help=True)
+    )
+
+    def __post_init__(self) -> None:
+        parser = self.parser
+        parser.add_argument(
+            "-v",
+            "--verbose",
+            action="count",
+            dest="verbosity",
+            help="increase verbosity level",
+        )
+        parser.add_argument(
+            "--keep-tmp",
+            action="store_true",
+            default=False,
+        )
+        parser.add_argument("paths", nargs="*", type=str)
+        parser.set_defaults(handler=self.handle)
+
+    def run(self, args: List[str]) -> None:
+        parse_result = self.parser.parse_args(args)
+
+        verbosity = parse_result.verbosity
+        if verbosity is not None:
+            root_logger = logging.getLogger("")
+            root_logger.propagate = True
+            new_level = (
+                root_logger.getEffectiveLevel()
+                - (min(1, verbosity)) * 10
+                - min(max(0, verbosity - 1), 9) * 1
+            )
+            root_logger.setLevel(new_level)
+
+        logging.debug(
+            "args = %s, parse_result = %s, logging.level = %s",
+            args,
+            parse_result,
+            logging.getLogger("").getEffectiveLevel(),
+        )
+
+        parse_result.handler(parse_result)
+
+    def handle(self, parse_result: argparse.Namespace) -> None:
+        logging.debug("entry ...")
+
+        paths = {Path(path).absolute() for path in parse_result.paths}
+
+        logging.debug("paths = %s", paths)
+
+        if parse_result.keep_tmp:
+            tmp_path = Path(mkdtemp())
+        else:
+            tmp_dir = TemporaryDirectory()
+            tmp_path = Path(tmp_dir.name)
+
+        for resource in RESOURCES:
+            if paths:
+                include = False
+                for path in paths:
+                    try:
+                        resource.local_path.absolute().relative_to(path)
+                        include = True
+                    except ValueError:
+                        # not relative to, ignoring
+                        pass
+                if not include:
+                    logging.info("skipping %s", resource.local_path)
+                    continue
+            resource.fetch(tmp_path)
+
+
+def main() -> None:
+    logging.basicConfig(
+        level=os.environ.get("PYLOGGING_LEVEL", logging.INFO),
+        stream=sys.stderr,
+        datefmt="%Y-%m-%dT%H:%M:%S",
+        format=(
+            "%(asctime)s.%(msecs)03d %(process)d %(thread)d %(levelno)03d:%(levelname)-8s "
+            "%(name)-12s %(module)s:%(lineno)s:%(funcName)s %(message)s"
+        ),
+    )
+
+    Application().run(sys.argv[1:])
+
+
+if __name__ == "__main__":
+    main()
diff --git a/test/data/suites/w3c/n3/LICENSE.md b/test/data/suites/w3c/n3/LICENSE.md
@@ -0,0 +1,4 @@
+All documents in this Repository are licensed by contributors
+under both the the [W3C Test Suite License](http://www.w3.org/Consortium/Legal/2008/04-testsuite-license) and 
+[W3C Software and Document License](https://www.w3.org/Consortium/Legal/copyright-software).
+