From 17340dcbc3d6725ff50c168ec26d950d40ed3278 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sat, 21 May 2022 12:04:03 +0100 Subject: [PATCH 1/4] Fix `response_fields` validation in case `page_limit` is not implemented --- optimade/validator/validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimade/validator/validator.py b/optimade/validator/validator.py index f34c8cf4c..343a0a184 100644 --- a/optimade/validator/validator.py +++ b/optimade/validator/validator.py @@ -557,7 +557,7 @@ def _check_response_fields(self, endp: str, fields: List[str]) -> Tuple[bool, st test_query = f"{endp}?response_fields={','.join(subset_fields)}&page_limit=1" response, _ = self._get_endpoint(test_query, multistage=True) - if response and len(response.json()["data"]) == 1: + if response and len(response.json()["data"]) >= 1: doc = response.json()["data"][0] expected_fields = set(subset_fields) expected_fields -= CONF.top_level_non_attribute_fields From cf590ac809d5b9b993e915d52f2a996fa20f997d Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sat, 21 May 2022 12:11:24 +0100 Subject: [PATCH 2/4] Harden `page_limit` tests: - Remove CLI control of page_limit and set the default value to something less common (4) - Check `next` links for duplicates during pagination --- optimade/validator/__init__.py | 9 +++++++-- optimade/validator/validator.py | 20 +++++++++++++++++--- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/optimade/validator/__init__.py b/optimade/validator/__init__.py index 3148a2aa4..2e5f3b167 100644 --- a/optimade/validator/__init__.py +++ b/optimade/validator/__init__.py @@ -1,5 +1,6 @@ """ This module contains the ImplementationValidator class and corresponding command line tools. 
""" # pylint: disable=import-outside-toplevel +import warnings from optimade import __version__, __api_version__ from .validator import ImplementationValidator from .utils import DEFAULT_CONN_TIMEOUT, DEFAULT_READ_TIMEOUT @@ -89,7 +90,7 @@ def validate(): # pragma: no cover parser.add_argument( "--page_limit", type=int, - default=5, + default=None, help="Alter the requested page limit for some tests.", ) @@ -140,6 +141,11 @@ def validate(): # pragma: no cover if args["as_type"] is not None and args["as_type"] not in valid_types: sys.exit(f"{args['as_type']} is not a valid type, must be one of {valid_types}") + if args["page_limit"] is not None: + warnings.warn( + "The `--page_limit` flag is now deprecated and will not be used by the validator." + ) + validator = ImplementationValidator( base_url=args["base_url"], verbosity=args["verbosity"], @@ -149,7 +155,6 @@ def validate(): # pragma: no cover run_optional_tests=not args["skip_optional"], fail_fast=args["fail_fast"], minimal=args["minimal"], - page_limit=args["page_limit"], http_headers=args["headers"], timeout=args["timeout"], read_timeout=args["read_timeout"], diff --git a/optimade/validator/validator.py b/optimade/validator/validator.py index 343a0a184..2940137b3 100644 --- a/optimade/validator/validator.py +++ b/optimade/validator/validator.py @@ -12,7 +12,7 @@ class that can be pointed at an OPTIMADE implementation and validated import random import urllib.parse import dataclasses -from typing import Union, Tuple, Any, List, Dict, Optional +from typing import Union, Tuple, Any, List, Dict, Optional, Set try: import simplejson as json @@ -70,7 +70,7 @@ def __init__( # pylint: disable=too-many-arguments base_url: str = None, verbosity: int = 0, respond_json: bool = False, - page_limit: int = 5, + page_limit: int = 4, max_retries: int = 5, run_optional_tests: bool = True, fail_fast: bool = False, @@ -1237,7 +1237,10 @@ def _test_bad_version_returns_553(self) -> None: @test_case def _test_page_limit( - self, 
response: requests.models.Response, check_next_link: int = 5 + self, + response: requests.models.Response, + check_next_link: int = 5, + previous_links: Optional[Set[str]] = None, ) -> Tuple[Optional[bool], str]: """Test that a multi-entry endpoint obeys the page limit by following pagination links up to a depth of `check_next_link`. @@ -1247,12 +1250,16 @@ def _test_page_limit( compliance. check_next_link: Maximum recursion depth for following pagination links. + previous_links: A set of previous links that will be used + to check that the `next` link is actually new. Returns: `True` if the test was successful and `None` if not, with a string summary. """ + if previous_links is None: + previous_links = set() try: response = response.json() except (AttributeError, json.JSONDecodeError): @@ -1285,6 +1292,12 @@ def _test_page_limit( "Endpoint suggested more data was available but provided no valid links->next link." ) + if next_link in previous_links: + raise ResponseError( + f"The next link {next_link} has been provided already for a previous page." + ) + previous_links.add(next_link) + if not isinstance(next_link, str): raise ResponseError( f"Unable to parse links->next {next_link!r} as a link." 
@@ -1302,6 +1315,7 @@ def _test_page_limit( next_response, check_next_link=check_next_link, multistage=check_next_link, + previous_links=previous_links, ) return ( From 1dbe1f7338c741fc16baa782d5094d1a21073cc9 Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sat, 21 May 2022 12:26:28 +0100 Subject: [PATCH 3/4] Do not rely on `page_offset` for filter tests --- optimade/validator/validator.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/optimade/validator/validator.py b/optimade/validator/validator.py index 2940137b3..a099180c8 100644 --- a/optimade/validator/validator.py +++ b/optimade/validator/validator.py @@ -380,6 +380,7 @@ def _recurse_through_endpoint(self, endp: str) -> Tuple[bool, str]: prop_list = list(_impl_properties.keys()) self._check_response_fields(endp, prop_list) + chosen_entry, _ = self._get_archetypal_entry(endp, prop_list) if not chosen_entry: @@ -524,19 +525,16 @@ def _get_archetypal_entry( if data_returned < 1: return ( None, - "Endpoint {endp!r} returned no entries, cannot get archetypal entry.", + f"Endpoint {endp!r} returned no entries, cannot get archetypal entry or test filtering.", ) - response, message = self._get_endpoint( - f"{endp}?page_offset={random.randint(0, data_returned-1)}&response_fields={','.join(properties)}", - multistage=True, + archetypal_entry = response.json()["data"][ + random.randint(0, data_returned - 1) + ] + return ( + archetypal_entry, + f"set archetypal entry for {endp} with ID {archetypal_entry['id']}.", ) - if response: - archetypal_entry = response.json()["data"][0] - return ( - archetypal_entry, - f"set archetypal entry for {endp} with ID {archetypal_entry['id']}.", - ) raise ResponseError(f"Failed to get archetypal entry. 
Details: {message}") From cdabed9e486e01acb74bd1a32f60855bf1e65a1a Mon Sep 17 00:00:00 2001 From: Matthew Evans Date: Sat, 21 May 2022 12:48:24 +0100 Subject: [PATCH 4/4] Reorder filter test precedence: - Make sure optional queries don't do an 'allowed failure' before more important non-optional checks - Assume that the archetypal entry will always be on the first page of results (assumes constant default sorting from the API) rather than allowing bad results to hide in future pages --- optimade/validator/validator.py | 80 ++++++++++++++++----------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/optimade/validator/validator.py b/optimade/validator/validator.py index a099180c8..8d57e2532 100644 --- a/optimade/validator/validator.py +++ b/optimade/validator/validator.py @@ -823,8 +823,48 @@ def _construct_single_property_filters( else: num_data_returned[operator] = response["meta"].get("data_returned") + if prop in CONF.unique_properties and operator == "=": + if num_data_returned["="] is not None and num_data_returned["="] == 0: + raise ResponseError( + f"Unable to filter field 'id' for equality, no data was returned for {query}." + ) + if num_data_returned["="] is not None and num_data_returned["="] > 1: + raise ResponseError( + f"Filter for an individual 'id' returned {num_data_returned['=']} results, when only 1 was expected." 
+ ) + num_response = num_data_returned[operator] + excluded = operator in exclusive_operators + # if we have all results on this page, check that the blessed ID is in the response + # if not response["meta"]["more_data_available"]: + if excluded and ( + chosen_entry["id"] in set(entry["id"] for entry in response["data"]) + ): + raise ResponseError( + f"Entry {chosen_entry['id']} with value {prop!r}: {test_value} was not excluded by {query!r}" + ) + + # check that at least the archetypal structure was returned, unless we are using a fallback value + if not excluded and not using_fallback: + if ( + num_data_returned[operator] is not None + and num_data_returned[operator] < 1 + ): + raise ResponseError( + f"Supposedly inclusive query {query!r} did not include original entry ID {chosen_entry['id']!r} " + f"(with field {prop!r} = {test_value}) potentially indicating a problem with filtering on this field." + ) + + # check that the filter returned no entries that had a null or missing value for the filtered property + if any( + entry["attributes"].get(prop, entry.get(prop, None)) is None + for entry in response.get("data", []) + ): + raise ResponseError( + f"Filter {query!r} on field {prop!r} returned entries that had null or missing values for the field." + ) + # Numeric and string comparisons must work both ways... if prop_type in ( DataType.STRING, @@ -903,46 +943,6 @@ def _construct_single_property_filters( f"Filter {reversed_query!r} on field {prop!r} returned entries that had null or missing values for the field." 
) - excluded = operator in exclusive_operators - # if we have all results on this page, check that the blessed ID is in the response - if not response["meta"]["more_data_available"]: - if excluded and ( - chosen_entry["id"] in set(entry["id"] for entry in response["data"]) - ): - raise ResponseError( - f"Object {chosen_entry['id']} with value {test_value} was not excluded by {query}" - ) - - # check that at least the archetypal structure was returned, unless we are using a fallback value - if not excluded and not using_fallback: - if ( - num_data_returned[operator] is not None - and num_data_returned[operator] < 1 - ): - raise ResponseError( - f"Supposedly inclusive query '{query}' did not include original object ID {chosen_entry['id']} " - f"(with field '{prop} = {test_value}') potentially indicating a problem with filtering on this field." - ) - - # check that the filter returned no entries that had a null or missing value for the filtered property - if any( - entry["attributes"].get(prop, entry.get(prop, None)) is None - for entry in response.get("data", []) - ): - raise ResponseError( - f"Filter {query!r} on field {prop!r} returned entries that had null or missing values for the field." - ) - - if prop in CONF.unique_properties: - if num_data_returned["="] is not None and num_data_returned["="] == 0: - raise ResponseError( - f"Unable to filter field 'id' for equality, no data was returned for {query}." - ) - if num_data_returned["="] is not None and num_data_returned["="] > 1: - raise ResponseError( - f"Filter for an individual 'id' returned {num_data_returned['=']} results, when only 1 was expected." - ) - return True, f"{prop} passed filter tests" def _test_info_or_links_endpoint(self, request_str: str) -> Union[bool, dict]: