diff --git a/Dockerfile b/Dockerfile index 28de131b..ab3adbec 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,7 +3,7 @@ # Defines the runtime for the arXiv search service, which provides the main # UIs (and, eventually, APIs) for search. -FROM arxiv/base:0.12.1rc2 +FROM arxiv/base:0.12.1 WORKDIR /opt/arxiv diff --git a/Dockerfile-agent b/Dockerfile-agent index ab160de3..ebea1c2b 100644 --- a/Dockerfile-agent +++ b/Dockerfile-agent @@ -4,7 +4,7 @@ # article metadata becomes available. Subscribes to a Kinesis stream for # notifications about new metadata. -FROM arxiv/search +FROM arxiv/search:0.5.1 WORKDIR /opt/arxiv diff --git a/Dockerfile-api b/Dockerfile-api index 918b9f1f..edd4c7ea 100644 --- a/Dockerfile-api +++ b/Dockerfile-api @@ -3,7 +3,7 @@ # Defines the runtime for the arXiv search API, which provides a metadata # query API backed by Elasticsearch. -FROM arxiv/base:0.12.1rc2 +FROM arxiv/base:0.12.1 WORKDIR /opt/arxiv diff --git a/Dockerfile-index b/Dockerfile-index index fbd048ba..daa5ba91 100644 --- a/Dockerfile-index +++ b/Dockerfile-index @@ -16,7 +16,7 @@ # # See also ELASTICSEARCH_* and METADATA_ENDPOINT parameters, below. -FROM arxiv/base:0.12.1rc2 +FROM arxiv/base:0.12.1 # Add Python consumer and configuration. 
ADD requirements/prod.txt /opt/arxiv/requirements.txt diff --git a/Pipfile b/Pipfile index 571da1cc..1007f418 100644 --- a/Pipfile +++ b/Pipfile @@ -4,8 +4,8 @@ verify_ssl = true name = "pypi" [packages] -arxiv-base = "==0.12.1rc2" -arxiv-auth = "==0.1.0rc14" +arxiv-auth = "==0.2.1" +arxiv-base = "==0.12.1" boto = "==2.48.0" "boto3" = "==1.6.6" botocore = "==1.9.6" diff --git a/Pipfile.lock b/Pipfile.lock index 70b87fff..1341c718 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "59b8e0253b404fbf5bc25a8b9602e3ab61a9aabd1baa2a20768d000b7dda49bd" + "sha256": "4c3241a3b556d7e9216ac347d549203b49888c9e23f529b8281f7380760b3b78" }, "pipfile-spec": 6, "requires": {}, @@ -16,17 +16,17 @@ "default": { "arxiv-auth": { "hashes": [ - "sha256:6a10165a775bdfca281a8ffac5aefe737e02b92f62c6ee95c5d793bae30dd457" + "sha256:19b5d5b4c226ef34986031f9f46836b5e7b44715db47b240b9af7c19a132cbb9" ], "index": "pypi", - "version": "==0.1.0rc14" + "version": "==0.2.1" }, "arxiv-base": { "hashes": [ - "sha256:9896b4f54d4a5e20c5f7ab7e8c65aa022781f8eba149b9730896fe22a44e0535" + "sha256:f8fa599e50550e0c6ee9de53030c22c0b8921fca7ac1268753a6b2b0fef177e9" ], "index": "pypi", - "version": "==0.12.1rc2" + "version": "==0.12.1" }, "bleach": { "hashes": [ diff --git a/schema/resources/Classification.json b/schema/resources/Classification.json new file mode 100644 index 00000000..7bb2e1cd --- /dev/null +++ b/schema/resources/Classification.json @@ -0,0 +1,9 @@ +{ + "title": "Classification", + "type": "object", + "properties": { + "archive": {"$ref": "./ClassificationTerm.json"}, + "group": {"$ref": "./ClassificationTerm.json"}, + "category": {"$ref": "./ClassificationTerm.json"} + } +} diff --git a/schema/resources/ClassificationTerm.json b/schema/resources/ClassificationTerm.json new file mode 100644 index 00000000..5f0a686b --- /dev/null +++ b/schema/resources/ClassificationTerm.json @@ -0,0 +1,13 @@ +{ + "title": "ClassificationTerm", + "type": "object", + 
"properties": { + "id": { + "type": "string" + }, + "name": { + "type": "string" + } + }, + "required": ["id", "name"] +} diff --git a/schema/resources/Document.json b/schema/resources/Document.json index c55ead5f..53330af2 100644 --- a/schema/resources/Document.json +++ b/schema/resources/Document.json @@ -111,11 +111,11 @@ "uniqueItems": true }, "primary_classification": { - "$ref": "./Document.json#/definitions/classification" + "$ref": "./Classification.json" }, "secondary_classification": { "type": "array", - "items": {"$ref": "./Document.json#/definitions/classification"} + "items": {"$ref": "./Classification.json"} }, "report_num": { "type": "string" @@ -188,51 +188,5 @@ "href", "canonical" ] - }, - "definitions": { - "category": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "name": { - "type": "string" - } - }, - "required": ["id", "name"] - }, - "archive": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "name": { - "type": "string" - } - }, - "required": ["id", "name"] - }, - "group": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "name": { - "type": "string" - } - }, - "required": ["id", "name"] - }, - "classification": { - "type": "object", - "properties": { - "archive": {"$ref": "#/definitions/archive"}, - "group": {"$ref": "#/definitions/group"}, - "category": {"$ref": "#/definitions/category"} - } - } } } diff --git a/schema/resources/DocumentMetadata.json b/schema/resources/DocumentMetadata.json index 0f6fb3ab..7a957ae9 100644 --- a/schema/resources/DocumentMetadata.json +++ b/schema/resources/DocumentMetadata.json @@ -1,52 +1,6 @@ { "title": "DocumentMetadata", "description": "Schema for arXiv document metadata provided by the docmeta endpoint.", - "definitions": { - "category": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "name": { - "type": "string" - } - }, - "required": ["id", "name"] - }, - "archive": { - "type": "object", - 
"properties": { - "id": { - "type": "string" - }, - "name": { - "type": "string" - } - }, - "required": ["id", "name"] - }, - "group": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "name": { - "type": "string" - } - }, - "required": ["id", "name"] - }, - "classification": { - "type": "object", - "properties": { - "archive": {"$ref": "#/definitions/archive"}, - "group": {"$ref": "#/definitions/group"}, - "category": {"$ref": "#/definitions/category"} - } - } - }, "type": "object", "properties": { "abs_categories": { @@ -216,11 +170,11 @@ "type": "string" }, "primary_classification": { - "$ref": "#/definitions/classification" + "$ref": "./Classification.json" }, "secondary_classification": { "type": "array", - "items": {"$ref": "#/definitions/classification"} + "items": {"$ref": "./Classification.json"} }, "proxy": { "type": "string" diff --git a/schema/resources/DocumentSet.json b/schema/resources/DocumentSet.json index b664a7b1..27576192 100644 --- a/schema/resources/DocumentSet.json +++ b/schema/resources/DocumentSet.json @@ -18,6 +18,21 @@ "total": { "description": "Total number of documents that respond to this query.", "type": "integer" + }, + "query": { + "description": "Query parameters interpreted from the request.", + "type": "array", + "items": { + "type": "object", + "properties": { + "parameter": { + "type": "string" + }, + "value": { + "type": "string" + } + } + } } } }, @@ -25,7 +40,7 @@ "type": "array", "items": { "type": "object", - "$ref": "Document.json#Document" + "$ref": "Document.json" } } } diff --git a/schema/search.yaml b/schema/search.yaml index e77367db..8f6d3306 100644 --- a/schema/search.yaml +++ b/schema/search.yaml @@ -20,6 +20,102 @@ paths: description: | Returns all published arXiv papers that respond to the specified query parameters. By default, returns most recent papers first. 
+ + ## Example request + + ```bash + curl \ + -H "Authorization: Bearer 4mggHnvB3ZV1bV3GObE6wZFw8pul5nGyzfeABSdfDg" \ + https://api.arxiv.org/metadata/?size=5&license=http://arxiv.org/licenses/nonexclusive-distrib/1.0/&include=license + + ``` + + ## Example response + + ```json + { + "metadata": { + "end": 5, + "query": [ + { + "parameter": "license", + "value": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/" + }, + { + "parameter": "include", + "value": "license" + } + ], + "size": 5, + "start": 0, + "total": 993119 + }, + "results": [ + { + "canonical": "https://arxiv.org/abs/1812.01565v1", + "href": "http://127.0.0.1:5000/1812.01565v1", + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.01565", + "paper_id_v": "1812.01565v1", + "title": "Impact of radiation backgrounds on the formation of massive black holes", + "version": 1 + }, + { + "canonical": "https://arxiv.org/abs/1812.03980v1", + "href": "http://127.0.0.1:5000/1812.03980v1", + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.03980", + "paper_id_v": "1812.03980v1", + "title": "Building Ethically Bounded AI", + "version": 1 + }, + { + "canonical": "https://arxiv.org/abs/1812.03942v1", + "href": "http://127.0.0.1:5000/1812.03942v1", + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.03942", + "paper_id_v": "1812.03942v1", + "title": "Accurate Evaluation of $\\mathcal{P}$,$\\mathcal{T}$-odd Faraday Effect in Atoms of Xe and Hg", + "version": 1 + }, + { + "canonical": "https://arxiv.org/abs/1812.03969v1", + "href": "http://127.0.0.1:5000/1812.03969v1", + "license": { + "href": 
"http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.03969", + "paper_id_v": "1812.03969v1", + "title": "Modified gravity, gravitational waves and the large-scale structure of the Universe: A brief report", + "version": 1 + }, + { + "canonical": "https://arxiv.org/abs/1812.03956v1", + "href": "http://127.0.0.1:5000/1812.03956v1", + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "paper_id": "1812.03956", + "paper_id_v": "1812.03956v1", + "title": "X-ray reflectivity with a twist: quantitative time-resolved X-ray reflectivity using monochromatic synchrotron radiation", + "version": 1 + } + ] + } + + ``` + parameters: - name: all in: query @@ -32,6 +128,7 @@ paths: explode: true schema: type: string + - name: author in: query description: | @@ -46,6 +143,8 @@ paths: explode: true schema: type: string + example: sinskaja, e n + - name: title in: query description: | @@ -56,6 +155,8 @@ paths: explode: true schema: type: string + example: "theory of life" + - name: abstract in: query description: | @@ -66,6 +167,8 @@ paths: explode: true schema: type: string + example: abstr?ct + - name: comments in: query description: | @@ -77,6 +180,8 @@ paths: explode: true schema: type: string + example: "color figures" + - name: journal_ref in: query description: | @@ -88,6 +193,8 @@ paths: explode: true schema: type: string + example: "j cool beans" + - name: report_num in: query description: | @@ -99,6 +206,8 @@ paths: explode: true schema: type: string + example: SU-4240-720 + - name: acm_class in: query description: | @@ -109,6 +218,8 @@ paths: explode: true schema: type: string + example: F.2.2 + - name: msc_class in: query description: | @@ -119,6 +230,8 @@ paths: explode: true schema: type: string + example: 14J60 + - name: doi in: query 
description: | @@ -129,6 +242,8 @@ paths: explode: true schema: type: string + example: 10.1016/S0550-3213(01)00405-9 + - name: paper_id in: query description: | @@ -140,6 +255,8 @@ paths: explode: true schema: type: string + example: 1601.00123 + - name: orcid in: query description: | @@ -153,6 +270,8 @@ paths: explode: true schema: type: string + example: 0000-0002-0564-9939 + - name: author_id in: query description: | @@ -166,12 +285,57 @@ paths: explode: true schema: type: string - - name: primary_category + example: warner_s_1 + + - name: primary_classification + in: query + description: | + Limit query by primary classification. This field supports + filtering by group, archive, and category. Note that group names + are prefixed by ``grp_``, e.g. ``grp_physics``, ``grp_q-bio``. See + https://arxiv.github.io/arxiv-base/arxiv/arxiv.taxonomy.html for + more information. + examples: + groupMath: + summary: Limit results to the math group. + value: grp_math + archivePhysics: + summary: Limit results to the physics archive. + value: physics + categoryHE: + summary: | + Limit results to the High Energy Astrophysical Phenomena + category. + value: astro-ph.HE + required: false + style: form + schema: + type: array + items: + type: string + + - name: secondary_classification in: query description: | - Slug for the primary category or categories to which results - should be limited. Valid categories are defined at - https://github.com/cul-it/arxiv-base/blob/master/arxiv/taxonomy/__init__.py#L306. + Limit query by secondary (cross-list) classification. This field + supports filtering by group, archive, and category. Note that group + names are prefixed by ``grp_``, e.g. ``grp_physics``, + ``grp_q-bio``. See + https://arxiv.github.io/arxiv-base/arxiv/arxiv.taxonomy.html for + more information. + examples: + groupMath: + summary: Limit results to those cross-listed in the math group.
+ value: grp_math + archivePhysics: + summary: | + Limit results to those cross-listed in the physics archive. + value: physics + categoryHE: + summary: | + Limit results to those cross-listed in the High Energy + Astrophysical Phenomena category. + value: astro-ph.HE required: false style: form explode: true @@ -179,6 +343,7 @@ paths: type: array items: type: string + - name: include in: query description: | @@ -192,6 +357,8 @@ paths: type: array items: type: string + example: title + - name: start_date in: query description: | @@ -202,6 +369,8 @@ paths: schema: type: string format: date + example: "1998-04-03" + - name: end_date in: query description: | @@ -212,6 +381,8 @@ paths: schema: type: string format: date + example: "1998-04-09" + - name: date_type in: query description: | @@ -227,6 +398,7 @@ paths: - submitted_date_first - submitted_date - announced_date_first + example: submitted_date_first responses: '200': @@ -247,6 +419,150 @@ paths: Get metadata about an arXiv paper by arXiv ID. See https://arxiv.org/help/arxiv_identifier for information about arXiv paper identifiers. + + ## Example request + + ```bash + curl \ + -H "Authorization: Bearer 4mggHnvB3ZV1bV3GOaE8wZFw8pul5nGyzfeABSdfDg" \ + https://api.arxiv.org/metadata/1811.00536v1 + ``` + + ## Example response + ```json + { + "abs_categories": "cond-mat.str-el hep-th math-ph math.MP", + "abstract": "We give a systematic construction and classification of fermionic symmetry-protected topological states for generic fermionic symmetry group $G_f=\\mathbb Z_2^f\\rtimes G_b$, which is a central extension of bosonic symmetry group $G_b$ (may contain time reversal symmetry) by the fermion parity symmetry group $\\mathbb Z_2^f=\\{1,P_f\\}$. For each class in the classification (except those with 2D $p+ip$ chiral superconductor decorations), we construct a fixed-point wave function which admits exactly solvable commuting-projector Hamiltonian.
The classification is based on the notion of equivalence class of fermionic symmetric local unitary transformations.", + "acm_class": [], + "announced_date_first": "2018-11", + "authors": [ + { + "affiliation": [], + "author_id": null, + "first_name": "Qing-Rui", + "full_name": "Qing-Rui Wang", + "last_name": "Wang", + "orcid": null, + "suffix": "" + }, + { + "affiliation": [], + "author_id": null, + "first_name": "Zheng-Cheng", + "full_name": "Zheng-Cheng Gu", + "last_name": "Gu", + "orcid": null, + "suffix": "" + } + ], + "authors_freeform": "Qing-Rui Wang and Zheng-Cheng Gu", + "canonical": "https://arxiv.org/abs/1811.00536v1", + "comments": "68 pages, 16 figures", + "doi": [], + "formats": [ + { + "format": "pdf", + "href": "https://arxiv.org/pdf/1811.00536v1" + }, + { + "format": "other", + "href": "https://arxiv.org/format/1811.00536v1" + } + ], + "href": "https://api.arxiv.org/metadata/1811.00536v1", + "is_current": true, + "is_withdrawn": false, + "journal_ref": "", + "latest": { + "canonical": "https://arxiv.org/abs/1811.00536v1", + "href": "https://api.arxiv.org/metadata/1811.00536v1", + "paper_id": "1811.00536v1", + "version": 1 + }, + "license": { + "href": "http://arxiv.org/licenses/nonexclusive-distrib/1.0/", + "label": "arXiv.org perpetual, non-exclusive license to distribute this article" + }, + "msc_class": [], + "owners": [ + { + "affiliation": [], + "author_id": null, + "first_name": "Qing-Rui", + "full_name": "Qing-Rui Wang", + "last_name": "Wang", + "orcid": null, + "suffix": "" + } + ], + "paper_id": "1811.00536", + "paper_id_v": "1811.00536v1", + "primary_classification": { + "archive": { + "id": "cond-mat", + "name": "Condensed Matter" + }, + "category": { + "id": "cond-mat.str-el", + "name": "Strongly Correlated Electrons" + }, + "group": { + "id": "grp_physics", + "name": "Physics" + } + }, + "report_num": "", + "secondary_classification": [ + { + "archive": { + "id": "hep-th", + "name": "High Energy Physics - Theory" + }, + "category": { 
+ "id": "hep-th", + "name": "High Energy Physics - Theory" + }, + "group": { + "id": "grp_physics", + "name": "Physics" + } + }, + { + "archive": { + "id": "math-ph", + "name": "Mathematical Physics" + }, + "category": { + "id": "math-ph", + "name": "Mathematical Physics" + }, + "group": { + "id": "grp_physics", + "name": "Physics" + } + } + ], + "source": { + "flags": "", + "format": "pdftex", + "size_bytes": 2478991 + }, + "submitted_date": "2018-11-01T13:57:22-04:00", + "submitted_date_first": "2018-11-01T13:57:22-04:00", + "submitter": { + "affiliation": [], + "author_id": null, + "first_name": "", + "full_name": "Qing-Rui Wang", + "last_name": "", + "orcid": null, + "suffix": "" + }, + "title": "General group super-cohomology theory of fermionic symmetry-protected topological phases", + "version": 1 + } + ``` + operationId: getPaperByID parameters: - name: id diff --git a/search/controllers/api/__init__.py b/search/controllers/api/__init__.py index 9b6c5b47..d689f4e7 100644 --- a/search/controllers/api/__init__.py +++ b/search/controllers/api/__init__.py @@ -45,23 +45,35 @@ def search(params: MultiDict) -> Tuple[Dict[str, Any], int, Dict[str, Any]]: Extra headers for the response. 
""" q = APIQuery() - terms = _get_fielded_terms(params) + query_terms: List[Dict[str, Any]] = [] + terms = _get_fielded_terms(params, query_terms) if terms is not None: q.terms = terms - date_range = _get_date_params(params) + date_range = _get_date_params(params, query_terms) if date_range is not None: q.date_range = date_range - classifications = _get_classifications(params) - if classifications is not None: - q.classification = classifications - - include_fields = _get_include_fields(params) + primary = params.get('primary_classification') + if primary: + primary_classification = _get_classification(primary, + 'primary_classification', + query_terms) + q.primary_classification = primary_classification + + secondaries = params.getlist('secondary_classification') + if secondaries: + q.secondary_classification = [ + _get_classification(sec, 'secondary_classification', query_terms) + for sec in secondaries + ] + + include_fields = _get_include_fields(params, query_terms) if include_fields: q.include_fields += include_fields q = paginate(q, params) # type: ignore document_set = index.search(q, highlight=False) + document_set.metadata['query'] = query_terms logger.debug('Got document set with %i results', len(document_set.results)) return {'results': document_set, 'query': q}, status.HTTP_200_OK, {} @@ -98,19 +110,24 @@ def paper(paper_id: str) -> Tuple[Dict[str, Any], int, Dict[str, Any]]: return {'results': document}, status.HTTP_200_OK, {} -def _get_include_fields(params: MultiDict) -> List[str]: +def _get_include_fields(params: MultiDict, query_terms: List) -> List[str]: include_fields = params.getlist('include') allowed_fields = Document.fields() if include_fields: - return [field for field in include_fields if field in allowed_fields] + inc = [field for field in include_fields if field in allowed_fields] + for field in inc: + query_terms.append({'parameter': 'include', 'value': field}) + return inc return [] -def _get_fielded_terms(params: MultiDict) -> 
Optional[FieldedSearchList]: +def _get_fielded_terms(params: MultiDict, query_terms: List) \ + -> Optional[FieldedSearchList]: terms = FieldedSearchList() for field, _ in Query.SUPPORTED_FIELDS: values = params.getlist(field) for value in values: + query_terms.append({'parameter': field, 'value': value}) terms.append(FieldedSearchTerm( # type: ignore operator='AND', field=field, @@ -121,7 +138,8 @@ def _get_fielded_terms(params: MultiDict) -> Optional[FieldedSearchList]: return terms -def _get_date_params(params: MultiDict) -> Optional[DateRange]: +def _get_date_params(params: MultiDict, query_terms: List) \ + -> Optional[DateRange]: date_params = {} for field in ['start_date', 'end_date']: value = params.getlist(field) @@ -135,22 +153,48 @@ def _get_date_params(params: MultiDict) -> Optional[DateRange]: except ValueError: raise BadRequest({'field': field, 'reason': 'invalid datetime'}) date_params[field] = dt + query_terms.append({'parameter': field, 'value': dt}) if 'date_type' in params: date_params['date_type'] = params.get('date_type') + query_terms.append({'parameter': 'date_type', + 'value': date_params['date_type']}) if date_params: return DateRange(**date_params) # type: ignore return None -def _get_classifications(params: MultiDict) -> Optional[ClassificationList]: - classifications = ClassificationList() - for value in params.getlist('primary_classification'): - if value not in taxonomy.ARCHIVES: - raise BadRequest({ - 'field': 'primary_classification', - 'reason': 'not a valid archive' - }) - classifications.append(Classification(archive={'id': value})) # type: ignore - if len(classifications) == 0: - return None - return classifications +def _to_classification(value: str, query_terms: List) \ + -> Tuple[Classification, ...]: + clsns = [] + if value in taxonomy.definitions.GROUPS: + klass = taxonomy.Group + field = 'group' + elif value in taxonomy.definitions.ARCHIVES: + klass = taxonomy.Archive + field = 'archive' + elif value in 
taxonomy.definitions.CATEGORIES: + klass = taxonomy.Category + field = 'category' + else: + raise ValueError('not a valid classification') + cast_value = klass(value) + clsns.append(Classification(**{field: {'id': value}})) # type: ignore + if cast_value.unalias() != cast_value: + clsns.append(Classification(**{field: {'id': cast_value.unalias()}})) # type: ignore + if cast_value.canonical != cast_value \ + and cast_value.canonical != cast_value.unalias(): + clsns.append(Classification(**{field: {'id': cast_value.canonical}})) # type: ignore + return tuple(clsns) + + +def _get_classification(value: str, field: str, query_terms: List) \ + -> Tuple[Classification, ...]: + try: + clsns = _to_classification(value, query_terms) + except ValueError: + raise BadRequest({ + 'field': field, + 'reason': 'not a valid classification term' + }) + query_terms.append({'parameter': field, 'value': value}) + return clsns diff --git a/search/controllers/api/tests.py b/search/controllers/api/tests.py new file mode 100644 index 00000000..6b9cb24b --- /dev/null +++ b/search/controllers/api/tests.py @@ -0,0 +1,144 @@ +"""Tests for advanced search controller, :mod:`search.controllers.advanced`.""" + +from unittest import TestCase, mock +from datetime import date, datetime +from dateutil.relativedelta import relativedelta +from werkzeug import MultiDict +from werkzeug.exceptions import InternalServerError, BadRequest + +from arxiv import status + +from search.domain import Query, DateRange, FieldedSearchTerm, Classification,\ + AdvancedQuery, DocumentSet +from search.controllers import api +from search.domain import api as api_domain +from search.services.index import IndexConnectionError, QueryError + + +class TestAPISearch(TestCase): + """Tests for :func:`.api.search`.""" + + @mock.patch(f'{api.__name__}.index') + def test_no_params(self, mock_index): + """Request with no parameters.""" + params = MultiDict({}) + data, code, headers = api.search(params) + + self.assertEqual(code, 
status.HTTP_200_OK, "Returns 200 OK") + self.assertIn("results", data, "Results are returned") + self.assertIn("query", data, "Query object is returned") + expected_fields = api_domain.get_required_fields() \ + + api_domain.get_default_extra_fields() + self.assertEqual(set(data["query"].include_fields), + set(expected_fields), + "Default set of fields is included") + + @mock.patch(f'{api.__name__}.index') + def test_include_fields(self, mock_index): + """Request with specific fields included.""" + extra_fields = ['title', 'abstract', 'authors'] + params = MultiDict({'include': extra_fields}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + self.assertIn("results", data, "Results are returned") + self.assertIn("query", data, "Query object is returned") + expected_fields = api_domain.get_required_fields() + extra_fields + self.assertEqual(set(data["query"].include_fields), + set(expected_fields), + "Requested fields are included") + + @mock.patch(f'{api.__name__}.index') + def test_group_primary_classification(self, mock_index): + """Request with a group as primary classification.""" + group = 'grp_physics' + params = MultiDict({'primary_classification': group}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertEqual(len(query.primary_classification), 1) + self.assertEqual(query.primary_classification[0], + Classification(group={'id': group})) + + @mock.patch(f'{api.__name__}.index') + def test_archive_primary_classification(self, mock_index): + """Request with an archive as primary classification.""" + archive = 'physics' + params = MultiDict({'primary_classification': archive}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertEqual(len(query.primary_classification), 1) + 
self.assertEqual(query.primary_classification[0], + Classification(archive={'id': archive})) + + @mock.patch(f'{api.__name__}.index') + def test_archive_subsumed_classification(self, mock_index): + """Request with a subsumed archive as primary classification.""" + archive = 'chao-dyn' + params = MultiDict({'primary_classification': archive}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertEqual(len(query.primary_classification), 2) + self.assertEqual(query.primary_classification[0], + Classification(archive={'id': archive})) + self.assertEqual(query.primary_classification[1], + Classification(archive={'id': 'nlin.CD'}), + "The canonical archive is used instead") + + @mock.patch(f'{api.__name__}.index') + def test_category_primary_classification(self, mock_index): + """Request with a category as primary classification.""" + category = 'cs.DL' + params = MultiDict({'primary_classification': category}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertEqual(len(query.primary_classification), 1) + self.assertEqual(query.primary_classification[0], + Classification(category={'id': category})) + + @mock.patch(f'{api.__name__}.index') + def test_bad_classification(self, mock_index): + """Request with nonsense as primary classification.""" + params = MultiDict({'primary_classification': 'nonsense'}) + with self.assertRaises(BadRequest): + api.search(params) + + @mock.patch(f'{api.__name__}.index') + def test_with_start_date(self, mock_index): + """Request with dates specified.""" + params = MultiDict({'start_date': '1999-01-02'}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertIsNotNone(query.date_range) + 
self.assertEqual(query.date_range.start_date.year, 1999) + self.assertEqual(query.date_range.start_date.month, 1) + self.assertEqual(query.date_range.start_date.day, 2) + self.assertEqual(query.date_range.date_type, + DateRange.SUBMITTED_CURRENT, + "Submitted date of current version is the default") + + @mock.patch(f'{api.__name__}.index') + def test_with_end_dates_and_type(self, mock_index): + """Request with end date and date type specified.""" + params = MultiDict({'end_date': '1999-01-02', + 'date_type': 'announced_date_first'}) + data, code, headers = api.search(params) + + self.assertEqual(code, status.HTTP_200_OK, "Returns 200 OK") + query = mock_index.search.call_args[0][0] + self.assertIsNotNone(query.date_range) + self.assertEqual(query.date_range.end_date.year, 1999) + self.assertEqual(query.date_range.end_date.month, 1) + self.assertEqual(query.date_range.end_date.day, 2) + + self.assertEqual(query.date_range.date_type, + DateRange.ANNOUNCED) diff --git a/search/domain/api.py b/search/domain/api.py index a1bae351..1d7604c9 100644 --- a/search/domain/api.py +++ b/search/domain/api.py @@ -1,10 +1,10 @@ """API-specific domain classes.""" -from .base import DateRange, Query, ClassificationList, List +from .base import DateRange, Query, ClassificationList, Classification, List from .advanced import FieldedSearchList, FieldedSearchTerm from dataclasses import dataclass, field -from typing import NamedTuple, Optional +from typing import NamedTuple, Optional, Tuple def get_default_extra_fields() -> List[str]: @@ -26,10 +26,13 @@ class APIQuery(Query): """ date_range: Optional[DateRange] = None - classification: ClassificationList = field( - default_factory=ClassificationList + primary_classification: Tuple[Classification, ...] 
= \ + field(default_factory=tuple) + """Limit results to a specific primary classification.""" + secondary_classification: List[Tuple[Classification, ...]] = field( + default_factory=list ) - include_cross_list: bool = field(default=True) + """Limit results by cross-list classification.""" terms: FieldedSearchList = field(default_factory=FieldedSearchList) include_fields: List[str] = field(default_factory=get_default_extra_fields) diff --git a/search/routes/api/serialize.py b/search/routes/api/serialize.py index 369b1c94..910d172a 100644 --- a/search/routes/api/serialize.py +++ b/search/routes/api/serialize.py @@ -148,6 +148,7 @@ def serialize(cls, document_set: DocumentSet, 'end': document_set.metadata.get('end'), 'size': document_set.metadata.get('size'), 'total': document_set.metadata.get('total'), + 'query': document_set.metadata.get('query', []) }, }) return serialized diff --git a/search/services/index/__init__.py b/search/services/index/__init__.py index 1216836c..dba9db93 100644 --- a/search/services/index/__init__.py +++ b/search/services/index/__init__.py @@ -40,6 +40,7 @@ from .util import MAX_RESULTS from .advanced import advanced_search from .simple import simple_search +from .api import api_search from . import highlighting from . 
import results @@ -394,10 +395,12 @@ def search(self, query: Query, highlight: bool = True) -> DocumentSet: logger.debug('got current search request %s', str(query)) current_search = self._base_search() try: - if isinstance(query, AdvancedQuery) or isinstance(query, APIQuery): + if isinstance(query, AdvancedQuery): current_search = advanced_search(current_search, query) elif isinstance(query, SimpleQuery): current_search = simple_search(current_search, query) + elif isinstance(query, APIQuery): + current_search = api_search(current_search, query) except TypeError as e: raise e # logger.error('Malformed query: %s', str(e)) diff --git a/search/services/index/advanced.py b/search/services/index/advanced.py index ebedda22..d33fa3d1 100644 --- a/search/services/index/advanced.py +++ b/search/services/index/advanced.py @@ -8,14 +8,13 @@ from elasticsearch_dsl import Search, Q, SF from elasticsearch_dsl.query import Range, Match, Bool -from search.domain import AdvancedQuery, Classification, APIQuery +from search.domain import AdvancedQuery, Classification from .prepare import SEARCH_FIELDS, limit_by_classification from .util import sort -def advanced_search(search: Search, - query: Union[AdvancedQuery, APIQuery]) -> Search: +def advanced_search(search: Search, query: AdvancedQuery) -> Search: """ Prepare a :class:`.Search` from a :class:`.AdvancedQuery`. @@ -23,9 +22,8 @@ def advanced_search(search: Search, ---------- search : :class:`.Search` An Elasticsearch search in preparation. - query : :class:`.AdvancedQuery` or :class:`APIQuery` - An advanced query, originating from the advanced search controller or - the API, respectively. + query : :class:`.AdvancedQuery` + A query originating from the advanced search UI. 
Returns ------- @@ -59,7 +57,7 @@ def advanced_search(search: Search, return search -def _date_range(q: Union[AdvancedQuery, APIQuery]) -> Range: +def _date_range(q: AdvancedQuery) -> Range: """Generate a query part for a date range.""" if not q.date_range: return Q() @@ -106,7 +104,7 @@ def _get_operator(obj: Any) -> str: return obj.operator # type: ignore -def _group_terms(query: Union[AdvancedQuery, APIQuery]) -> tuple: +def _group_terms(query: AdvancedQuery) -> tuple: """Group fielded search terms into a set of nested tuples.""" terms = query.terms[:] for operator in ['NOT', 'AND', 'OR']: @@ -121,7 +119,7 @@ def _group_terms(query: Union[AdvancedQuery, APIQuery]) -> tuple: return terms[0] # type: ignore -def _fielded_terms_to_q(query: Union[AdvancedQuery, APIQuery]) -> Match: +def _fielded_terms_to_q(query: AdvancedQuery) -> Match: if len(query.terms) == 1: return SEARCH_FIELDS[query.terms[0].field](query.terms[0].term) elif len(query.terms) > 1: diff --git a/search/services/index/api.py b/search/services/index/api.py new file mode 100644 index 00000000..1d76c122 --- /dev/null +++ b/search/services/index/api.py @@ -0,0 +1,132 @@ +"""Supports the advanced search feature.""" + +from typing import Any, Union + +from functools import reduce, wraps +from operator import ior, iand + +from elasticsearch_dsl import Search, Q, SF +from elasticsearch_dsl.query import Range, Match, Bool + +from search.domain import Classification, APIQuery + +from .prepare import SEARCH_FIELDS, query_primary_exact, query_secondary_exact +from .util import sort + + +def api_search(search: Search, query: APIQuery) -> Search: + """ + Prepare a :class:`.Search` from a :class:`.APIQuery`. + + Parameters + ---------- + search : :class:`.Search` + An Elasticsearch search in preparation. + query : :class:`.APIQuery` + An query originating from the API. 
+ + Returns + ------- + :class:`.Search` + The passed ES search object, updated with specific query parameters + that implement the advanced query. + + """ + # Classification and date are treated as filters; this foreshadows the + # behavior of faceted search. + if not query.include_older_versions: + search = search.filter("term", is_current=True) + + _q_clsn = Q() + if query.primary_classification: + _q_clsn &= reduce(ior, map(query_primary_exact, + list(query.primary_classification))) + if query.secondary_classification: + for classification in query.secondary_classification: + _q_clsn &= reduce(ior, map(query_secondary_exact, + list(classification))) + q = ( + _fielded_terms_to_q(query) + & _date_range(query) + & _q_clsn + ) + if query.order is None or query.order == 'relevance': + # Boost the current version heavily when sorting by relevance. + q = Q('function_score', query=q, boost=5, boost_mode="multiply", + score_mode="max", + functions=[ + SF({'weight': 5, 'filter': Q('term', is_current=True)}) + ]) + search = sort(query, search) + search = search.query(q) + return search + + +def _date_range(q: APIQuery) -> Range: + """Generate a query part for a date range.""" + if not q.date_range: + return Q() + params = {} + if q.date_range.date_type == q.date_range.ANNOUNCED: + fmt = '%Y-%m' + else: + fmt = '%Y-%m-%dT%H:%M:%S%z' + if q.date_range.start_date: + params["gte"] = q.date_range.start_date.strftime(fmt) + if q.date_range.end_date: + params["lt"] = q.date_range.end_date.strftime(fmt) + return Q('range', **{q.date_range.date_type: params}) + + +def _grouped_terms_to_q(term_pair: tuple) -> Q: + """Generate a :class:`.Q` from grouped terms.""" + term_a_raw, operator, term_b_raw = term_pair + + if type(term_a_raw) is tuple: + term_a = _grouped_terms_to_q(term_a_raw) + else: + term_a = SEARCH_FIELDS[term_a_raw.field](term_a_raw.term) + + if type(term_b_raw) is tuple: + term_b = _grouped_terms_to_q(term_b_raw) + else: + term_b = 
SEARCH_FIELDS[term_b_raw.field](term_b_raw.term) + + if operator == 'OR': + return term_a | term_b + elif operator == 'AND': + return term_a & term_b + elif operator == 'NOT': + return term_a & ~term_b + else: + # TODO: Confirm proper exception. + raise TypeError("Invalid operator for terms") + + +def _get_operator(obj: Any) -> str: + if type(obj) is tuple: + return _get_operator(obj[0]) + return obj.operator # type: ignore + + +def _group_terms(query: APIQuery) -> tuple: + """Group fielded search terms into a set of nested tuples.""" + terms = query.terms[:] + for operator in ['NOT', 'AND', 'OR']: + i = 0 + while i < len(terms) - 1: + if _get_operator(terms[i+1]) == operator: + terms[i] = (terms[i], operator, terms[i+1]) + terms.pop(i+1) + i -= 1 + i += 1 + assert len(terms) == 1 + return terms[0] # type: ignore + + +def _fielded_terms_to_q(query: APIQuery) -> Match: + if len(query.terms) == 1: + return SEARCH_FIELDS[query.terms[0].field](query.terms[0].term) + elif len(query.terms) > 1: + return _grouped_terms_to_q(_group_terms(query)) + return Q('match_all') diff --git a/search/services/index/prepare.py b/search/services/index/prepare.py index eed44d26..5ae48294 100644 --- a/search/services/index/prepare.py +++ b/search/services/index/prepare.py @@ -123,6 +123,27 @@ def _query_primary(term: str, operator: str = 'and') -> Q: }) +def query_primary_exact(classification: Classification) -> Q: + """Generate a :class:`Q` for primary classification by ID.""" + return reduce(iand, [ + Q("match", **{f"primary_classification__{field}__id": + getattr(classification, field)['id']}) + for field in ['group', 'archive', 'category'] + if getattr(classification, field, None) is not None + ]) + + +def query_secondary_exact(classification: Classification) -> Q: + """Generate a :class:`Q` for secondary classification by ID.""" + return Q("nested", path="secondary_classification", + query=reduce(iand, [ + Q("match", **{f"secondary_classification__{field}__id": + 
getattr(classification, field)['id']}) + for field in ['group', 'archive', 'category'] + if getattr(classification, field, None) is not None + ])) + + def _query_secondary(term: str, operator: str = 'and') -> Q: return Q( "nested",