Harden validator for partially implemented APIs #1181

Merged · 4 commits · May 21, 2022
Showing changes from all commits.
`optimade/validator/__init__.py` (9 changes: 7 additions & 2 deletions)
```diff
@@ -1,5 +1,6 @@
 """ This module contains the ImplementationValidator class and corresponding command line tools. """
 # pylint: disable=import-outside-toplevel
+import warnings
 from optimade import __version__, __api_version__
 from .validator import ImplementationValidator
 from .utils import DEFAULT_CONN_TIMEOUT, DEFAULT_READ_TIMEOUT
```
```diff
@@ -89,7 +90,7 @@ def validate(): # pragma: no cover
     parser.add_argument(
         "--page_limit",
         type=int,
-        default=5,
+        default=None,
         help="Alter the requested page limit for some tests.",
     )
```

```diff
@@ -140,6 +141,11 @@ def validate(): # pragma: no cover
     if args["as_type"] is not None and args["as_type"] not in valid_types:
         sys.exit(f"{args['as_type']} is not a valid type, must be one of {valid_types}")
 
+    if args["page_limit"] is not None:
+        warnings.warn(
+            "The `--page_limit` flag is now deprecated and will not be used by the validator."
+        )
+
     validator = ImplementationValidator(
         base_url=args["base_url"],
         verbosity=args["verbosity"],
```
```diff
@@ -149,7 +155,6 @@ def validate(): # pragma: no cover
         run_optional_tests=not args["skip_optional"],
         fail_fast=args["fail_fast"],
         minimal=args["minimal"],
-        page_limit=args["page_limit"],
         http_headers=args["headers"],
         timeout=args["timeout"],
         read_timeout=args["read_timeout"],
```
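For context on the change above: the flag is kept so that existing invocations do not break, but its value is never forwarded to the validator. A minimal standalone sketch of the same deprecation idiom (the parser and flag names here are illustrative, not the package's actual CLI entry point):

```python
# Sketch of the deprecation idiom used above: keep accepting the old flag
# with a `None` default so its use can be detected, warn, and never
# forward it. Illustrative names only.
import argparse
import warnings

def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--page_limit",
        type=int,
        default=None,  # `None` means "not supplied by the user"
        help="Deprecated; retained so old invocations keep working.",
    )
    args = vars(parser.parse_args())

    if args["page_limit"] is not None:
        warnings.warn(
            "The `--page_limit` flag is now deprecated and will not be used."
        )

if __name__ == "__main__":
    main()
```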
`optimade/validator/validator.py` (120 changes: 66 additions & 54 deletions)
```diff
@@ -12,7 +12,7 @@ class that can be pointed at an OPTIMADE implementation and validated
 import random
 import urllib.parse
 import dataclasses
-from typing import Union, Tuple, Any, List, Dict, Optional
+from typing import Union, Tuple, Any, List, Dict, Optional, Set
 
 try:
     import simplejson as json
```
```diff
@@ -70,7 +70,7 @@ def __init__( # pylint: disable=too-many-arguments
         base_url: str = None,
         verbosity: int = 0,
         respond_json: bool = False,
-        page_limit: int = 5,
+        page_limit: int = 4,
         max_retries: int = 5,
         run_optional_tests: bool = True,
         fail_fast: bool = False,
```
```diff
@@ -380,6 +380,7 @@ def _recurse_through_endpoint(self, endp: str) -> Tuple[bool, str]:
         prop_list = list(_impl_properties.keys())
 
         self._check_response_fields(endp, prop_list)
+
         chosen_entry, _ = self._get_archetypal_entry(endp, prop_list)
 
         if not chosen_entry:
```
```diff
@@ -524,19 +525,16 @@ def _get_archetypal_entry(
         if data_returned < 1:
             return (
                 None,
-                "Endpoint {endp!r} returned no entries, cannot get archetypal entry.",
+                "Endpoint {endp!r} returned no entries, cannot get archetypal entry or test filtering.",
             )
 
-        response, message = self._get_endpoint(
-            f"{endp}?page_offset={random.randint(0, data_returned-1)}&response_fields={','.join(properties)}",
-            multistage=True,
-        )
-
         if response:
-            archetypal_entry = response.json()["data"][0]
+            archetypal_entry = response.json()["data"][
+                random.randint(0, data_returned - 1)
+            ]
             return (
                 archetypal_entry,
                 f"set archetypal entry for {endp} with ID {archetypal_entry['id']}.",
             )
 
         raise ResponseError(f"Failed to get archetypal entry. Details: {message}")
```

> **Member (Author):** This change removes the need for `page_offset` to be implemented by the API, and instead just chooses a random result from the first page of the API results to use for the filter tests.
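The comment above describes the whole strategy; a minimal sketch of it in isolation (function and argument names are illustrative, not the validator's internals), assuming the first page of results has already been fetched:

```python
import random
from typing import Any, Dict, List, Optional

def choose_archetypal_entry(
    first_page: List[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """Pick a random entry from the already-fetched first page, so the
    server never needs to implement `page_offset` for this test to run."""
    if not first_page:
        return None  # no entries: the filter tests cannot proceed
    return random.choice(first_page)

# usage: entry = choose_archetypal_entry(response.json()["data"])
```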

```diff
@@ -557,7 +555,7 @@ def _check_response_fields(self, endp: str, fields: List[str]) -> Tuple[bool, str]:
             test_query = f"{endp}?response_fields={','.join(subset_fields)}&page_limit=1"
             response, _ = self._get_endpoint(test_query, multistage=True)
 
-            if response and len(response.json()["data"]) == 1:
+            if response and len(response.json()["data"]) >= 0:
                 doc = response.json()["data"][0]
                 expected_fields = set(subset_fields)
                 expected_fields -= CONF.top_level_non_attribute_fields
```

> **Member (Author):** This was previously allowing databases that did not support `page_limit` to skip this test.
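To make the skip concrete: a server that ignores `page_limit=1` returns a full page, so the old `== 1` branch never ran and the response-fields check was silently skipped. A toy illustration with made-up data:

```python
# A server that ignores `page_limit=1` and returns a full page of entries:
data = [{"id": str(i)} for i in range(10)]

ran_before = len(data) == 1  # False: the whole check used to be skipped
ran_after = len(data) >= 0   # True for any list: the check now always
                             # runs once a response has been received
assert not ran_before and ran_after
```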
```diff
@@ -825,8 +823,48 @@ def _construct_single_property_filters(
             else:
                 num_data_returned[operator] = response["meta"].get("data_returned")
 
+            if prop in CONF.unique_properties and operator == "=":
+                if num_data_returned["="] is not None and num_data_returned["="] == 0:
+                    raise ResponseError(
+                        f"Unable to filter field 'id' for equality, no data was returned for {query}."
+                    )
+                if num_data_returned["="] is not None and num_data_returned["="] > 1:
+                    raise ResponseError(
+                        f"Filter for an individual 'id' returned {num_data_returned['=']} results, when only 1 was expected."
+                    )
+
+            num_response = num_data_returned[operator]
+
+            excluded = operator in exclusive_operators
+            # if we have all results on this page, check that the blessed ID is in the response
+            # if not response["meta"]["more_data_available"]:
+            if excluded and (
+                chosen_entry["id"] in set(entry["id"] for entry in response["data"])
+            ):
+                raise ResponseError(
+                    f"Entry {chosen_entry['id']} with value {prop!r}: {test_value} was not excluded by {query!r}"
+                )
+
+            # check that at least the archetypal structure was returned, unless we are using a fallback value
+            if not excluded and not using_fallback:
+                if (
+                    num_data_returned[operator] is not None
+                    and num_data_returned[operator] < 1
+                ):
+                    raise ResponseError(
+                        f"Supposedly inclusive query {query!r} did not include original entry ID {chosen_entry['id']!r} "
+                        f"(with field {prop!r} = {test_value}) potentially indicating a problem with filtering on this field."
+                    )
+
+            # check that the filter returned no entries that had a null or missing value for the filtered property
+            if any(
+                entry["attributes"].get(prop, entry.get(prop, None)) is None
+                for entry in response.get("data", [])
+            ):
+                raise ResponseError(
+                    f"Filter {query!r} on field {prop!r} returned entries that had null or missing values for the field."
+                )
+
             # Numeric and string comparisons must work both ways...
             if prop_type in (
                 DataType.STRING,
```

> **@ml-evs** (Member, Author), May 21, 2022, on the `if prop in CONF.unique_properties and operator == "=":` line: This code block was just moved up from below the reversed query tests, with this line being the only change (previously it assumed `"="` was already in the dict).
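The three new checks amount to a small consistency predicate over each filtered page; a standalone sketch under simplified assumptions (string IDs, a stand-in operator set, and `ValueError` in place of the validator's `ResponseError`):

```python
from typing import Any, Dict, List

EXCLUSIVE_OPERATORS = {"!="}  # stand-in for the validator's exclusive set

def check_filtered_page(
    operator: str,
    prop: str,
    chosen_id: str,
    data: List[Dict[str, Any]],
) -> None:
    """Raise if a page returned for a single-property filter is inconsistent."""
    excluded = operator in EXCLUSIVE_OPERATORS

    # An exclusive filter must not return the entry the test value came from.
    if excluded and chosen_id in {entry["id"] for entry in data}:
        raise ValueError(f"{chosen_id!r} should have been excluded by {operator!r}")

    # An inclusive filter must return at least one entry (the chosen one).
    if not excluded and not data:
        raise ValueError(f"inclusive filter on {prop!r} returned no entries")

    # No returned entry may hold a null or missing value for the filtered field.
    if any(entry.get("attributes", {}).get(prop) is None for entry in data):
        raise ValueError(f"filter on {prop!r} returned null-valued entries")
```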
```diff
@@ -905,46 +943,6 @@ def _construct_single_property_filters(
                         f"Filter {reversed_query!r} on field {prop!r} returned entries that had null or missing values for the field."
                     )
 
-            excluded = operator in exclusive_operators
-            # if we have all results on this page, check that the blessed ID is in the response
-            if not response["meta"]["more_data_available"]:
-                if excluded and (
-                    chosen_entry["id"] in set(entry["id"] for entry in response["data"])
-                ):
-                    raise ResponseError(
-                        f"Object {chosen_entry['id']} with value {test_value} was not excluded by {query}"
-                    )
-
-            # check that at least the archetypal structure was returned, unless we are using a fallback value
-            if not excluded and not using_fallback:
-                if (
-                    num_data_returned[operator] is not None
-                    and num_data_returned[operator] < 1
-                ):
-                    raise ResponseError(
-                        f"Supposedly inclusive query '{query}' did not include original object ID {chosen_entry['id']} "
-                        f"(with field '{prop} = {test_value}') potentially indicating a problem with filtering on this field."
-                    )
-
-            # check that the filter returned no entries that had a null or missing value for the filtered property
-            if any(
-                entry["attributes"].get(prop, entry.get(prop, None)) is None
-                for entry in response.get("data", [])
-            ):
-                raise ResponseError(
-                    f"Filter {query!r} on field {prop!r} returned entries that had null or missing values for the field."
-                )
-
-            if prop in CONF.unique_properties:
-                if num_data_returned["="] is not None and num_data_returned["="] == 0:
-                    raise ResponseError(
-                        f"Unable to filter field 'id' for equality, no data was returned for {query}."
-                    )
-                if num_data_returned["="] is not None and num_data_returned["="] > 1:
-                    raise ResponseError(
-                        f"Filter for an individual 'id' returned {num_data_returned['=']} results, when only 1 was expected."
-                    )
-
         return True, f"{prop} passed filter tests"
 
     def _test_info_or_links_endpoint(self, request_str: str) -> Union[bool, dict]:
```
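The `operator == "="` guard is what makes the move safe: on a partially implemented API some operator queries never record a count, so the old unconditional lookup after the loop could fail. A toy illustration of the difference (dictionary contents invented for the example):

```python
# Counts are recorded per operator as each filter query runs; on a partial
# implementation the "=" query may never have been recorded at all.
num_data_returned = {">": 3}

# Old placement (after all operator loops) read the key unconditionally:
#     num_data_returned["="]  # -> KeyError on a partial implementation
# New placement runs inside the loop, only on the "=" iteration, right
# after that key has been written:
for operator in ("=", ">"):
    num_data_returned.setdefault(operator, 1)
    if operator == "=":
        count = num_data_returned["="]  # always present under the guard
```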
```diff
@@ -1237,7 +1235,10 @@ def _test_bad_version_returns_553(self) -> None:
 
     @test_case
     def _test_page_limit(
-        self, response: requests.models.Response, check_next_link: int = 5
+        self,
+        response: requests.models.Response,
+        check_next_link: int = 5,
+        previous_links: Optional[Set[str]] = None,
     ) -> Tuple[Optional[bool], str]:
         """Test that a multi-entry endpoint obeys the page limit by
         following pagination links up to a depth of `check_next_link`.
```
```diff
@@ -1247,12 +1248,16 @@ def _test_page_limit(
                 compliance.
             check_next_link: Maximum recursion depth for following
                 pagination links.
+            previous_links: A set of previous links that will be used
+                to check that the `next` link is actually new.
 
         Returns:
             `True` if the test was successful and `None` if not, with a
             string summary.
 
         """
+        if previous_links is None:
+            previous_links = set()
         try:
             response = response.json()
         except (AttributeError, json.JSONDecodeError):
```
```diff
@@ -1285,6 +1290,12 @@ def _test_page_limit(
                 "Endpoint suggested more data was available but provided no valid links->next link."
             )
 
+        if next_link in previous_links:
+            raise ResponseError(
+                f"The next link {next_link} has been provided already for a previous page."
+            )
+        previous_links.add(next_link)
+
         if not isinstance(next_link, str):
             raise ResponseError(
                 f"Unable to parse links->next {next_link!r} as a link."
```

> **Member (Author)**, on lines +1293 to +1298: Previously, it was valid to return the same next link for every page and get stuck in a cycle.
```diff
@@ -1302,6 +1313,7 @@ def _test_page_limit(
             next_response,
             check_next_link=check_next_link,
             multistage=check_next_link,
+            previous_links=previous_links,
         )
 
         return (
```
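Threading `previous_links` through the recursion turns link-following into cycle detection. A self-contained sketch of the same idea (a hypothetical `fetch` callable stands in for the validator's `_get_endpoint`, and `links->next` is simplified to a plain string):

```python
from typing import Callable, Dict, Set

def follow_next_links(
    first_page: Dict,
    fetch: Callable[[str], Dict],
    max_depth: int = 5,
) -> int:
    """Follow `links->next` for up to `max_depth` pages, raising on a repeat."""
    previous_links: Set[str] = set()
    page, depth = first_page, 0
    while depth < max_depth:
        next_link = (page.get("links") or {}).get("next")
        if not next_link:
            break  # no further pages advertised
        if next_link in previous_links:
            raise RuntimeError(f"next link {next_link!r} was already served")
        previous_links.add(next_link)
        page = fetch(next_link)  # request the next page
        depth += 1
    return depth
```

Without the seen-set, a server that returns the same next link for every page would have passed, since each page looked valid in isolation.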