Harden validator for partially implemented APIs #1181

Merged · 4 commits · May 21, 2022
Showing changes from all commits.
`optimade/validator/__init__.py` (9 changes: 7 additions & 2 deletions)
```diff
@@ -1,5 +1,6 @@
 """ This module contains the ImplementationValidator class and corresponding command line tools. """
 # pylint: disable=import-outside-toplevel
+import warnings
 from optimade import __version__, __api_version__
 from .validator import ImplementationValidator
 from .utils import DEFAULT_CONN_TIMEOUT, DEFAULT_READ_TIMEOUT
```
```diff
@@ -89,7 +90,7 @@ def validate(): # pragma: no cover
     parser.add_argument(
         "--page_limit",
         type=int,
-        default=5,
+        default=None,
         help="Alter the requested page limit for some tests.",
     )
```

```diff
@@ -140,6 +141,11 @@ def validate(): # pragma: no cover
     if args["as_type"] is not None and args["as_type"] not in valid_types:
         sys.exit(f"{args['as_type']} is not a valid type, must be one of {valid_types}")
 
+    if args["page_limit"] is not None:
+        warnings.warn(
+            "The `--page_limit` flag is now deprecated and will not be used by the validator."
+        )
+
     validator = ImplementationValidator(
         base_url=args["base_url"],
         verbosity=args["verbosity"],
```
```diff
@@ -149,7 +155,6 @@ def validate(): # pragma: no cover
         run_optional_tests=not args["skip_optional"],
         fail_fast=args["fail_fast"],
         minimal=args["minimal"],
-        page_limit=args["page_limit"],
         http_headers=args["headers"],
         timeout=args["timeout"],
         read_timeout=args["read_timeout"],
```
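For context on the change above: the flag is kept so that existing invocations do not break, but its value is never forwarded to the validator. A minimal standalone sketch of the same deprecation idiom (the parser and flag names here are illustrative, not the package's actual CLI entry point):

```python
# Sketch of the deprecation idiom used above: keep accepting the old flag
# with a `None` default so its use can be detected, warn, and never
# forward it. Illustrative names only.
import argparse
import warnings

def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--page_limit",
        type=int,
        default=None,  # `None` means "not supplied by the user"
        help="Deprecated; retained so old invocations keep working.",
    )
    args = vars(parser.parse_args())

    if args["page_limit"] is not None:
        warnings.warn(
            "The `--page_limit` flag is now deprecated and will not be used."
        )

if __name__ == "__main__":
    main()
```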
`optimade/validator/validator.py` (120 changes: 66 additions & 54 deletions)
```diff
@@ -12,7 +12,7 @@ class that can be pointed at an OPTIMADE implementation and validated
 import random
 import urllib.parse
 import dataclasses
-from typing import Union, Tuple, Any, List, Dict, Optional
+from typing import Union, Tuple, Any, List, Dict, Optional, Set
 
 try:
     import simplejson as json
```
```diff
@@ -70,7 +70,7 @@ def __init__( # pylint: disable=too-many-arguments
         base_url: str = None,
         verbosity: int = 0,
         respond_json: bool = False,
-        page_limit: int = 5,
+        page_limit: int = 4,
         max_retries: int = 5,
         run_optional_tests: bool = True,
         fail_fast: bool = False,
```
```diff
@@ -380,6 +380,7 @@ def _recurse_through_endpoint(self, endp: str) -> Tuple[bool, str]:
         prop_list = list(_impl_properties.keys())
 
         self._check_response_fields(endp, prop_list)
+
         chosen_entry, _ = self._get_archetypal_entry(endp, prop_list)
 
         if not chosen_entry:
```
```diff
@@ -524,19 +525,16 @@ def _get_archetypal_entry(
         if data_returned < 1:
             return (
                 None,
-                "Endpoint {endp!r} returned no entries, cannot get archetypal entry.",
+                "Endpoint {endp!r} returned no entries, cannot get archetypal entry or test filtering.",
             )
 
-        response, message = self._get_endpoint(
-            f"{endp}?page_offset={random.randint(0, data_returned-1)}&response_fields={','.join(properties)}",
-            multistage=True,
-        )
-
         if response:
-            archetypal_entry = response.json()["data"][0]
+            archetypal_entry = response.json()["data"][
+                random.randint(0, data_returned - 1)
+            ]
             return (
                 archetypal_entry,
                 f"set archetypal entry for {endp} with ID {archetypal_entry['id']}.",
             )
 
         raise ResponseError(f"Failed to get archetypal entry. Details: {message}")
```

> **Member (Author):** This change removes the need for `page_offset` to be implemented by the API, and instead just chooses a random result from the first page of the API results to use for the filter tests.
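The comment above describes the whole strategy; a minimal sketch of it in isolation (function and argument names are illustrative, not the validator's internals), assuming the first page of results has already been fetched:

```python
import random
from typing import Any, Dict, List, Optional

def choose_archetypal_entry(
    first_page: List[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """Pick a random entry from the already-fetched first page, so the
    server never needs to implement `page_offset` for this test to run."""
    if not first_page:
        return None  # no entries: the filter tests cannot proceed
    return random.choice(first_page)

# usage: entry = choose_archetypal_entry(response.json()["data"])
```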

```diff
@@ -557,7 +555,7 @@ def _check_response_fields(self, endp: str, fields: List[str]) -> Tuple[bool, str]:
             test_query = f"{endp}?response_fields={','.join(subset_fields)}&page_limit=1"
             response, _ = self._get_endpoint(test_query, multistage=True)
 
-            if response and len(response.json()["data"]) == 1:
+            if response and len(response.json()["data"]) >= 0:
                 doc = response.json()["data"][0]
                 expected_fields = set(subset_fields)
                 expected_fields -= CONF.top_level_non_attribute_fields
```

> **Member (Author):** This was previously allowing databases that did not support `page_limit` to skip this test.
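To make the skip concrete: a server that ignores `page_limit=1` returns a full page, so the old `== 1` branch never ran and the response-fields check was silently skipped. A toy illustration with made-up data:

```python
# A server that ignores `page_limit=1` and returns a full page of entries:
data = [{"id": str(i)} for i in range(10)]

ran_before = len(data) == 1  # False: the whole check used to be skipped
ran_after = len(data) >= 0   # True for any list: the check now always
                             # runs once a response has been received
assert not ran_before and ran_after
```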
```diff
@@ -825,8 +823,48 @@ def _construct_single_property_filters(
             else:
                 num_data_returned[operator] = response["meta"].get("data_returned")
 
+            if prop in CONF.unique_properties and operator == "=":
+                if num_data_returned["="] is not None and num_data_returned["="] == 0:
+                    raise ResponseError(
+                        f"Unable to filter field 'id' for equality, no data was returned for {query}."
+                    )
+                if num_data_returned["="] is not None and num_data_returned["="] > 1:
+                    raise ResponseError(
+                        f"Filter for an individual 'id' returned {num_data_returned['=']} results, when only 1 was expected."
+                    )
+
+            num_response = num_data_returned[operator]
+
+            excluded = operator in exclusive_operators
+            # if we have all results on this page, check that the blessed ID is in the response
+            # if not response["meta"]["more_data_available"]:
+            if excluded and (
+                chosen_entry["id"] in set(entry["id"] for entry in response["data"])
+            ):
+                raise ResponseError(
+                    f"Entry {chosen_entry['id']} with value {prop!r}: {test_value} was not excluded by {query!r}"
+                )
+
+            # check that at least the archetypal structure was returned, unless we are using a fallback value
+            if not excluded and not using_fallback:
+                if (
+                    num_data_returned[operator] is not None
+                    and num_data_returned[operator] < 1
+                ):
+                    raise ResponseError(
+                        f"Supposedly inclusive query {query!r} did not include original entry ID {chosen_entry['id']!r} "
+                        f"(with field {prop!r} = {test_value}) potentially indicating a problem with filtering on this field."
+                    )
+
+            # check that the filter returned no entries that had a null or missing value for the filtered property
+            if any(
+                entry["attributes"].get(prop, entry.get(prop, None)) is None
+                for entry in response.get("data", [])
+            ):
+                raise ResponseError(
+                    f"Filter {query!r} on field {prop!r} returned entries that had null or missing values for the field."
+                )
+
             # Numeric and string comparisons must work both ways...
             if prop_type in (
                 DataType.STRING,
```

> **@ml-evs** (Member, Author), May 21, 2022, on the `if prop in CONF.unique_properties and operator == "=":` line: This code block was just moved up from below the reversed query tests, with this line being the only change (previously it assumed `"="` was already in the dict).
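The three new checks amount to a small consistency predicate over each filtered page; a standalone sketch under simplified assumptions (string IDs, a stand-in operator set, and `ValueError` in place of the validator's `ResponseError`):

```python
from typing import Any, Dict, List

EXCLUSIVE_OPERATORS = {"!="}  # stand-in for the validator's exclusive set

def check_filtered_page(
    operator: str,
    prop: str,
    chosen_id: str,
    data: List[Dict[str, Any]],
) -> None:
    """Raise if a page returned for a single-property filter is inconsistent."""
    excluded = operator in EXCLUSIVE_OPERATORS

    # An exclusive filter must not return the entry the test value came from.
    if excluded and chosen_id in {entry["id"] for entry in data}:
        raise ValueError(f"{chosen_id!r} should have been excluded by {operator!r}")

    # An inclusive filter must return at least one entry (the chosen one).
    if not excluded and not data:
        raise ValueError(f"inclusive filter on {prop!r} returned no entries")

    # No returned entry may hold a null or missing value for the filtered field.
    if any(entry.get("attributes", {}).get(prop) is None for entry in data):
        raise ValueError(f"filter on {prop!r} returned null-valued entries")
```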
```diff
@@ -905,46 +943,6 @@ def _construct_single_property_filters(
                         f"Filter {reversed_query!r} on field {prop!r} returned entries that had null or missing values for the field."
                     )
 
-            excluded = operator in exclusive_operators
-            # if we have all results on this page, check that the blessed ID is in the response
-            if not response["meta"]["more_data_available"]:
-                if excluded and (
-                    chosen_entry["id"] in set(entry["id"] for entry in response["data"])
-                ):
-                    raise ResponseError(
-                        f"Object {chosen_entry['id']} with value {test_value} was not excluded by {query}"
-                    )
-
-            # check that at least the archetypal structure was returned, unless we are using a fallback value
-            if not excluded and not using_fallback:
-                if (
-                    num_data_returned[operator] is not None
-                    and num_data_returned[operator] < 1
-                ):
-                    raise ResponseError(
-                        f"Supposedly inclusive query '{query}' did not include original object ID {chosen_entry['id']} "
-                        f"(with field '{prop} = {test_value}') potentially indicating a problem with filtering on this field."
-                    )
-
-            # check that the filter returned no entries that had a null or missing value for the filtered property
-            if any(
-                entry["attributes"].get(prop, entry.get(prop, None)) is None
-                for entry in response.get("data", [])
-            ):
-                raise ResponseError(
-                    f"Filter {query!r} on field {prop!r} returned entries that had null or missing values for the field."
-                )
-
-            if prop in CONF.unique_properties:
-                if num_data_returned["="] is not None and num_data_returned["="] == 0:
-                    raise ResponseError(
-                        f"Unable to filter field 'id' for equality, no data was returned for {query}."
-                    )
-                if num_data_returned["="] is not None and num_data_returned["="] > 1:
-                    raise ResponseError(
-                        f"Filter for an individual 'id' returned {num_data_returned['=']} results, when only 1 was expected."
-                    )
-
         return True, f"{prop} passed filter tests"
 
     def _test_info_or_links_endpoint(self, request_str: str) -> Union[bool, dict]:
```
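The `operator == "="` guard is what makes the move safe: on a partially implemented API some operator queries never record a count, so the old unconditional lookup after the loop could fail. A toy illustration of the difference (dictionary contents invented for the example):

```python
# Counts are recorded per operator as each filter query runs; on a partial
# implementation the "=" query may never have been recorded at all.
num_data_returned = {">": 3}

# Old placement (after all operator loops) read the key unconditionally:
#     num_data_returned["="]  # -> KeyError on a partial implementation
# New placement runs inside the loop, only on the "=" iteration, right
# after that key has been written:
for operator in ("=", ">"):
    num_data_returned.setdefault(operator, 1)
    if operator == "=":
        count = num_data_returned["="]  # always present under the guard
```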
```diff
@@ -1237,7 +1235,10 @@ def _test_bad_version_returns_553(self) -> None:
 
     @test_case
     def _test_page_limit(
-        self, response: requests.models.Response, check_next_link: int = 5
+        self,
+        response: requests.models.Response,
+        check_next_link: int = 5,
+        previous_links: Optional[Set[str]] = None,
     ) -> Tuple[Optional[bool], str]:
         """Test that a multi-entry endpoint obeys the page limit by
         following pagination links up to a depth of `check_next_link`.
```
```diff
@@ -1247,12 +1248,16 @@ def _test_page_limit(
                 compliance.
             check_next_link: Maximum recursion depth for following
                 pagination links.
+            previous_links: A set of previous links that will be used
+                to check that the `next` link is actually new.
 
         Returns:
             `True` if the test was successful and `None` if not, with a
             string summary.
 
         """
+        if previous_links is None:
+            previous_links = set()
         try:
             response = response.json()
         except (AttributeError, json.JSONDecodeError):
```
```diff
@@ -1285,6 +1290,12 @@ def _test_page_limit(
                 "Endpoint suggested more data was available but provided no valid links->next link."
             )
 
+        if next_link in previous_links:
+            raise ResponseError(
+                f"The next link {next_link} has been provided already for a previous page."
+            )
+        previous_links.add(next_link)
+
         if not isinstance(next_link, str):
             raise ResponseError(
                 f"Unable to parse links->next {next_link!r} as a link."
```

> **Member (Author)**, on lines +1293 to +1298: Previously, it was valid to return the same next link for every page and get stuck in a cycle.
```diff
@@ -1302,6 +1313,7 @@ def _test_page_limit(
             next_response,
             check_next_link=check_next_link,
             multistage=check_next_link,
+            previous_links=previous_links,
         )
 
         return (
```
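Threading `previous_links` through the recursion turns link-following into cycle detection. A self-contained sketch of the same idea (a hypothetical `fetch` callable stands in for the validator's `_get_endpoint`, and `links->next` is simplified to a plain string):

```python
from typing import Callable, Dict, Set

def follow_next_links(
    first_page: Dict,
    fetch: Callable[[str], Dict],
    max_depth: int = 5,
) -> int:
    """Follow `links->next` for up to `max_depth` pages, raising on a repeat."""
    previous_links: Set[str] = set()
    page, depth = first_page, 0
    while depth < max_depth:
        next_link = (page.get("links") or {}).get("next")
        if not next_link:
            break  # no further pages advertised
        if next_link in previous_links:
            raise RuntimeError(f"next link {next_link!r} was already served")
        previous_links.add(next_link)
        page = fetch(next_link)  # request the next page
        depth += 1
    return depth
```

Without the seen-set, a server that returns the same next link for every page would have passed, since each page looked valid in isolation.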