From 473909cf7321314dab7fb79dbc6e3a042468463f Mon Sep 17 00:00:00 2001 From: erickpeirson Date: Thu, 19 Apr 2018 14:49:10 -0400 Subject: [PATCH 01/77] ARXIVNG-281 moved Kinesis BaseConsumer to arxiv-base --- Pipfile | 3 +- Pipfile.lock | 9 +- search/agent/__init__.py | 37 +- search/agent/base.py | 352 ------------------ search/agent/consumer.py | 5 +- search/agent/tests/test_base_consumer.py | 166 --------- .../tests/{tests.py => test_integration.py} | 2 +- 7 files changed, 11 insertions(+), 563 deletions(-) delete mode 100644 search/agent/base.py delete mode 100644 search/agent/tests/test_base_consumer.py rename search/agent/tests/{tests.py => test_integration.py} (99%) diff --git a/Pipfile b/Pipfile index 7f4fd3f5..12313eb8 100644 --- a/Pipfile +++ b/Pipfile @@ -7,7 +7,7 @@ name = "pypi" [packages] -arxiv-base = "==0.5.2" +"arxiv-base"==0.6 boto = "==2.48.0" "boto3" = "==1.6.6" botocore = "==1.9.6" @@ -50,6 +50,7 @@ typed-ast = "==1.1.0" werkzeug = "==0.13" wtforms = "==2.1" bleach = "*" +"17e504d" = {path = "./../arxiv-base-ui"} [dev-packages] diff --git a/Pipfile.lock b/Pipfile.lock index e3b87842..7414bbb3 100644 --- a/Pipfile.lock +++ b/Pipfile.lock @@ -1,7 +1,7 @@ { "_meta": { "hash": { - "sha256": "f22ed549999fe9d8243b6323183835e709216454c28c54fc5248560c36379cf3" + "sha256": "835437127f096ee2dfbac653520248f8a84fd703f3652183f07e82b6f49612c4" }, "host-environment-markers": { "implementation_name": "cpython", @@ -27,11 +27,8 @@ ] }, "default": { - "arxiv-base": { - "hashes": [ - "sha256:73afac321bd5a358ea63ba3dddeaa63834a22f44afb8b4872c73aa1a857c7a88" - ], - "version": "==0.5.2" + "17e504d": { + "path": "./../arxiv-base-ui" }, "bleach": { "hashes": [ diff --git a/search/agent/__init__.py b/search/agent/__init__.py index ad7cef49..2f81e130 100644 --- a/search/agent/__init__.py +++ b/search/agent/__init__.py @@ -9,17 +9,11 @@ and becomes available for discovery via :mod:`search.routes.ui`. """ from typing import Optional -from datetime import datetime -import warnings from flask import current_app as app -from arxiv.base import logging +from arxiv.base import agent from .consumer import MetadataRecordProcessor, DocumentFailed, IndexingFailed -from .base import CheckpointManager - -logger = logging.getLogger(__name__) -logger.propagate = False def process_stream(duration: Optional[int] = None) -> None: @@ -35,30 +29,5 @@ def process_stream(duration: Optional[int] = None) -> None: """ # We use the Flask application instance for configuration, and to manage # integrations with metadata service, search index. - with warnings.catch_warnings(): # boto3 is notoriously annoying. - warnings.simplefilter("ignore") - start_at = app.config.get('KINESIS_START_AT') - start_type = app.config.get('KINESIS_START_TYPE') - if not start_type: - start_type = 'AT_TIMESTAMP' - if start_type == 'AT_TIMESTAMP' and not start_at: - start_at = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') - - processor = MetadataRecordProcessor( - app.config['KINESIS_STREAM'], - app.config['KINESIS_SHARD_ID'], - app.config['AWS_ACCESS_KEY_ID'], - app.config['AWS_SECRET_ACCESS_KEY'], - app.config['AWS_REGION'], - CheckpointManager( - app.config['KINESIS_CHECKPOINT_VOLUME'], - app.config['KINESIS_STREAM'], - app.config['KINESIS_SHARD_ID'], - ), - endpoint=app.config.get('KINESIS_ENDPOINT', None), - verify=app.config.get('KINESIS_VERIFY', 'true') == 'true', - duration=duration, - start_type=start_type, - start_at=start_at - ) - processor.go() + agent.process_stream(MetadataRecordProcessor, app.config, + duration=duration) diff --git a/search/agent/base.py b/search/agent/base.py deleted file mode 100644 index acbd5585..00000000 --- a/search/agent/base.py +++ /dev/null @@ -1,352 +0,0 @@ -""" -Provides a base class for Kinesis record handling. - -.. _todo: This should move to arXiv-base, per ARXIVNG-281. -""" - -import time -import json -from datetime import datetime, timedelta -import os -from typing import Any, Optional, Tuple, Generator, Callable, Dict, Union -from contextlib import contextmanager -import signal - -import boto3 -from botocore.exceptions import WaiterError, NoCredentialsError, \ - PartialCredentialsError, BotoCoreError, ClientError - -from arxiv.base import logging -logger = logging.getLogger(__name__) -logger.propagate = False - -NOW = datetime.now().strftime('%Y-%m-%dT%H:%M:%S') - - -class CheckpointError(RuntimeError): - """Checkpointing failed.""" - - -class StreamNotAvailable(RuntimeError): - """Could not find or connect to the stream.""" - - -class KinesisRequestFailed(RuntimeError): - """Raised when a Kinesis request failed permanently.""" - - -class StopProcessing(RuntimeError): - """Gracefully stopped processing upon unrecoverable error.""" - - -class ConfigurationError(RuntimeError): - """There was a problem with the configuration.""" - - -def retry(retries: int = 5, wait: int = 5) -> Callable: - """ - Decorator factory for retrying Kinesis calls. - - Parameters - ---------- - retries : int - Number of times to retry before failing. - wait : int - Number of seconds to wait between retries. - - Returns - ------- - function - A decorator that retries the decorated func ``retries`` times before - raising :class:`.KinesisRequestFailed`. - - """ - __retries = retries - - def decorator(func: Callable) -> Callable: - """Retry the decorated func on ClientErrors up to ``retries`` times.""" - _retries = __retries - - def inner(*args, **kwargs) -> Any: # type: ignore - retries = _retries - while retries > 0: - try: - return func(*args, **kwargs) - except ClientError as e: - code = e.response['Error']['Code'] - logger.error('Caught ClientError %s, retrying', code) - time.sleep(wait) - retries -= 1 - raise KinesisRequestFailed('Max retries; last code: {code}') - return inner - return decorator - - -class CheckpointManager(object): - """Provides on-disk loading and updating of consumer checkpoints.""" - - def __init__(self, base_path: str, stream_name: str, shard_id: str) \ - -> None: - """Load or create a new file for checkpointing.""" - if not os.path.exists(base_path): - raise ValueError(f'Path does not exist: {base_path}') - self.file_path = os.path.join(base_path, - f'{stream_name}__{shard_id}.json') - if not os.path.exists(self.file_path): - try: - with open(self.file_path, 'w') as f: - f.write('') - except Exception as e: # The containing path doesn't exist. - raise ValueError(f'Could not use {self.file_path}') from e - - with open(self.file_path) as f: - position = f.read() - self.position = position if position else None - - def checkpoint(self, position: str) -> None: - """Checkpoint at ``position``.""" - try: - with open(self.file_path, 'w') as f: - f.write(position) - self.position = position - except Exception as e: - raise CheckpointError('Could not checkpoint') from e - - -class BaseConsumer(object): - """ - Kinesis stream consumer. - - Consumes a single shard from a single stream, and checkpoints on disk - (to reduce external dependencies). - """ - - def __init__(self, stream_name: str = '', shard_id: str = '', - access_key: str = '', secret_key: str = '', region: str = '', - checkpointer: Optional[CheckpointManager] = None, - back_off: int = 5, batch_size: int = 50, - endpoint: Optional[str] = None, verify: bool = True, - duration: Optional[int] = None, - start_type: str = 'AT_TIMESTAMP', - start_at: str = NOW) -> None: - """Initialize a new stream consumer.""" - logger.info(f'New consumer for {stream_name} ({shard_id})') - self.stream_name = stream_name - self.shard_id = shard_id - self.checkpointer = checkpointer - if self.checkpointer: - self.position = self.checkpointer.position - else: - self.position = None - self.duration = duration - self.start_time = None - self.back_off = back_off - self.batch_size = batch_size - self.sleep_time = 5 - self.start_at = start_at - self.start_type = start_type - logger.info(f'Got start_type={start_type} and start_at={start_at}') - - if not self.stream_name or not self.shard_id: - logger.info( - 'No stream indicated; making no attempt to connect to Kinesis' - ) - return - - logger.info(f'Getting a new connection to Kinesis at {endpoint}' - f' in region {region}, with SSL verification={verify}') - self.client = boto3.client('kinesis', - aws_access_key_id=access_key, - aws_secret_access_key=secret_key, - endpoint_url=endpoint, - verify=verify, - region_name=region) - - logger.info(f'Waiting for {self.stream_name} to be available') - try: - self.wait_for_stream() - except (KinesisRequestFailed, StreamNotAvailable): - logger.info('Could not connect to stream; attempting to create') - self.client.create_stream( - StreamName=self.stream_name, - ShardCount=1 - ) - logger.info(f'Created; waiting for {self.stream_name} again') - self.wait_for_stream() - - # Intercept SIGINT and SIGTERM so that we can checkpoint before exit. - self.exit = False - signal.signal(signal.SIGINT, self.stop) - signal.signal(signal.SIGTERM, self.stop) - logger.info('Ready to start') - - def stop(self, signal: int, frame: Any) -> None: - """Set exit flag for a graceful stop.""" - logger.error(f'Received signal {signal}') - self._checkpoint() - logger.error('Done') - raise StopProcessing(f'Received signal {signal}') - - @retry(5, 10) - def wait_for_stream(self) -> None: - """ - Wait for the stream to become available. - - If the stream becomes available, returns ``None``. Otherwise, raises - a :class:`.StreamNotAvailable` exception. - - Raises - ------ - :class:`.StreamNotAvailable` - Raised when the stream could not be reached. - - """ - waiter = self.client.get_waiter('stream_exists') - try: - logger.error(f'Waiting for stream {self.stream_name}') - waiter.wait( - StreamName=self.stream_name, - Limit=1, - ExclusiveStartShardId=self.shard_id - ) - except WaiterError as e: - logger.error('Failed to get stream while waiting') - raise StreamNotAvailable('Could not connect to stream') from e - except (PartialCredentialsError, NoCredentialsError) as e: - logger.error('Credentials missing or incomplete: %s', e.msg) - raise ConfigurationError('Credentials missing') from e - - def _get_iterator(self) -> str: - """ - Get a new shard iterator. - - If our position is set, we will start with the record immediately after - that position. Otherwise, we start at ``start_at`` timestamp. - - Returns - ------- - str - The sequence ID of the record on which to start. - - """ - params: Dict[str, Any] = dict( - StreamName=self.stream_name, - ShardId=self.shard_id - ) - if self.position: - params.update(dict( - ShardIteratorType='AFTER_SEQUENCE_NUMBER', - StartingSequenceNumber=self.position - )) - elif self.start_type == 'AT_TIMESTAMP' and self.start_at: - start_at = datetime.strptime(self.start_at, '%Y-%m-%dT%H:%M:%S') - params.update(dict( - ShardIteratorType='AT_TIMESTAMP', - Timestamp=( - start_at - datetime.utcfromtimestamp(0) - ).total_seconds() - )) - elif self.start_type == 'TRIM_HORIZON': - params.update(dict(ShardIteratorType='TRIM_HORIZON')) - try: - it: str = self.client.get_shard_iterator(**params)['ShardIterator'] - return it - except self.client.exceptions.InvalidArgumentException as e: - logger.info('Got InvalidArgumentException: %s', str(e)) - # Iterator may not have come from this stream/shard. - if self.position is not None: - self.position = None - return self._get_iterator() - raise KinesisRequestFailed('Could not get shard iterator') - - def _checkpoint(self) -> None: - """ - Checkpoint at the current position. - - The current position is the sequence number of the last record that was - successfully processed. - """ - if self.position is not None and self.checkpointer: - self.checkpointer.checkpoint(self.position) - logger.debug(f'Set checkpoint at {self.position}') - - @retry(retries=10, wait=5) - def get_records(self, iterator: str, limit: int) -> Tuple[str, dict]: - """Get the next batch of ``limit`` or fewer records.""" - logger.debug(f'Get more records from {iterator}, limit {limit}') - response = self.client.get_records(ShardIterator=iterator, - Limit=limit) - iterator = response['NextShardIterator'] - return iterator, response - - def _check_timeout(self) -> None: - """If a processing duration is set, exit if duration is exceeded.""" - if not self.start_time or not self.duration: - return - running_for = time.time() - self.start_time - if running_for > self.duration: - logger.info(f'Ran for {running_for} seconds; exiting') - self._checkpoint() - raise StopProcessing(f'Ran for {running_for} seconds; exiting') - - def process_records(self, start: str) -> Tuple[str, int]: - """Retrieve and process records starting at ``start``.""" - logger.debug(f'Get more records, starting at {start}') - processed = 0 - try: - time.sleep(self.sleep_time) # Don't get carried away. - next_start, response = self.get_records(start, self.batch_size) - except Exception as e: - self._checkpoint() - raise StopProcessing('Unhandled exception: %s' % str(e)) from e - - logger.debug('Got %i records', len(response['Records'])) - for record in response['Records']: - self._check_timeout() - - # It is possible that Kinesis will replay the same message several - # times, especially at the end of the stream. There's no point in - # replaying the message, so we'll continue on. - if record['SequenceNumber'] == self.position: - continue - - self.process_record(record) - processed += 1 - - # Setting the position means that we have successfully - # processed this record. - if record['SequenceNumber']: # Make sure it's set. - self.position = record['SequenceNumber'] - logger.debug(f'Updated position to {self.position}') - logger.debug(f'Next start is {next_start}') - return next_start, processed - - def go(self) -> None: - """Main processing routine.""" - self.start_time = time.time() - logger.info(f'Starting processing from position {self.position}' - f' on stream {self.stream_name} and shard {self.shard_id}') - - start = self._get_iterator() - while True: - start, processed = self.process_records(start) - if processed > 0: - self._checkpoint() # Checkpoint after every batch. - if start is None: # Shard is closed. - logger.error('Shard closed unexpectedly; no new iterator') - self._checkpoint() - raise StopProcessing('Could not get a new iterator') - self._check_timeout() - - def process_record(self, record: dict) -> None: - """ - Process a single record from the stream. - - Parameters - ---------- - record : dict - - """ - logger.info(f'Processing record {record["SequenceNumber"]}') - logger.debug(f'Process record {record}') - # raise NotImplementedError('Should be implemented by a subclass') diff --git a/search/agent/consumer.py b/search/agent/consumer.py index c83a569d..5c0bc60e 100644 --- a/search/agent/consumer.py +++ b/search/agent/consumer.py @@ -1,14 +1,13 @@ """Provides a record processor for MetadataIsAvailable notifications.""" -from typing import Dict, List import json import os -from typing import List, Any, Optional +from typing import List, Any, Optional, Dict from arxiv.base import logging from search.services import metadata, index from search.process import transform from search.domain import DocMeta, Document, asdict -from .base import BaseConsumer +from arxiv.base.agent import BaseConsumer logger = logging.getLogger(__name__) logger.propagate = False diff --git a/search/agent/tests/test_base_consumer.py b/search/agent/tests/test_base_consumer.py deleted file mode 100644 index ede647e5..00000000 --- a/search/agent/tests/test_base_consumer.py +++ /dev/null @@ -1,166 +0,0 @@ -"""Tests for :class:`.BaseConsumer`.""" - -from unittest import TestCase, mock -from botocore.exceptions import BotoCoreError, WaiterError, ClientError - -from search.agent.base import BaseConsumer, StreamNotAvailable, StopProcessing - - -class TestBaseConsumer(TestCase): - """Test :class:`.BaseConsumer` behavior and public methods.""" - - def setUp(self): - self.checkpointer = mock.MagicMock() - self.checkpointer.position = None - - @mock.patch('boto3.client') - def test_init(self, mock_client_factory): - """On init, consumer should wait for stream to be available.""" - mock_client = mock.MagicMock() - mock_waiter = mock.MagicMock() - mock_client.get_waiter.return_value = mock_waiter - mock_client_factory.return_value = mock_client - - try: - BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', 'us-east-1', - self.checkpointer) - except Exception as e: - self.fail('If the waiter returns without an exception, no' - ' exception should be raised.') - self.assertEqual(mock_waiter.wait.call_count, 1, - "A boto3 waiter should be used") - - @mock.patch('boto3.client') - def test_init_stream_not_available(self, mock_client_factory): - """If the stream is not available, should raise an exception.""" - mock_client = mock.MagicMock() - mock_waiter = mock.MagicMock() - - def raise_waiter_error(*a, **k): - raise WaiterError('', {}, {}) - - mock_waiter.wait.side_effect = raise_waiter_error - mock_client.get_waiter.return_value = mock_waiter - mock_client_factory.return_value = mock_client - with self.assertRaises(StreamNotAvailable): - BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', 'us-east-1', - self.checkpointer) - - @mock.patch('boto3.client') - def test_iteration(self, mock_client_factory): - """Test iteration behavior.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_records.return_value = { - 'Records': [ - {'SequenceNumber': str(i)} for i in range(10) - ], - 'NextShardIterator': '10' - } - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer) - next_start, processed = consumer.process_records('0') - self.assertGreater(mock_client.get_records.call_count, 0) - self.assertEqual(processed, 10) - self.assertEqual(next_start, '10', "Should return NextShardIterator") - - @mock.patch('boto3.client') - def test_process_records_until_shard_closes(self, mock_client_factory): - """Should call GetRecords until no next iterator is available.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - def get_records(**kwargs): - start = int(kwargs['ShardIterator']) - end = start + int(kwargs['Limit']) - if start > 500: - return {'Records': [], 'NextShardIterator': None} - return { - 'Records': [ - {'SequenceNumber': str(i)} for i in range(start, end) - ], - 'NextShardIterator': str(end + 1) - } - - mock_client.get_records.side_effect = get_records - - batch_size = 50 - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - batch_size=batch_size) - with self.assertRaises(StopProcessing): - consumer.go() - self.assertEqual(mock_client.get_records.call_count, - (500/batch_size) + 1, - "Should call Kinesis GetRecords until no iterator" - " is returned.") - - @mock.patch('boto3.client') - def test_process_records_with_clienterror(self, mock_client_factory): - """Should try to checkpoint before exiting.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - def raise_client_error(*args, **kwargs): - raise ClientError({'Error': {'Code': 'foo'}}, {}) - - mock_client.get_records.side_effect = raise_client_error - - batch_size = 50 - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - batch_size=batch_size) - consumer.position = 'fooposition' - try: - consumer.go() - except Exception: - pass - self.assertEqual(self.checkpointer.checkpoint.call_count, 1) - - @mock.patch('boto3.client') - def test_start_from_timestamp(self, mock_client_factory): - """Consumer is initialized with start_type 'AT_TIMESTAMP'.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - start_type='AT_TIMESTAMP') - consumer._get_iterator() - args, kwargs = mock_client.get_shard_iterator.call_args - self.assertEqual(kwargs['ShardIteratorType'], 'AT_TIMESTAMP') - self.assertIn('Timestamp', kwargs) - - @mock.patch('boto3.client') - def test_start_from_position(self, mock_client_factory): - """Consumer is initialized with start_type 'AT_TIMESTAMP'.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - start_type='AT_TIMESTAMP') - consumer.position = 'fooposition' - consumer._get_iterator() - args, kwargs = mock_client.get_shard_iterator.call_args - self.assertEqual(kwargs['ShardIteratorType'], 'AFTER_SEQUENCE_NUMBER') - self.assertEqual(kwargs['StartingSequenceNumber'], 'fooposition') - - @mock.patch('boto3.client') - def test_start_from_trim_horizon(self, mock_client_factory): - """Consumer is initialized with start_type 'AT_TIMESTAMP'.""" - mock_client = mock.MagicMock() - mock_client_factory.return_value = mock_client - mock_client.get_shard_iterator.return_value = {'ShardIterator': '1'} - - consumer = BaseConsumer('foo', '1', 'a1b2c3d4', 'qwertyuiop', - 'us-east-1', self.checkpointer, - start_type='TRIM_HORIZON') - consumer._get_iterator() - args, kwargs = mock_client.get_shard_iterator.call_args - self.assertEqual(kwargs['ShardIteratorType'], 'TRIM_HORIZON') - self.assertNotIn('StartingSequenceNumber', kwargs) diff --git a/search/agent/tests/tests.py b/search/agent/tests/test_integration.py similarity index 99% rename from search/agent/tests/tests.py rename to search/agent/tests/test_integration.py index 9ad22496..9fed6ca3 100644 --- a/search/agent/tests/tests.py +++ b/search/agent/tests/test_integration.py @@ -10,7 +10,7 @@ import threading from search.agent import process_stream -from search.agent.base import StopProcessing +from arxiv.base.agent import StopProcessing from search.services import metadata from search.domain import DocMeta from search.factory import create_ui_web_app From 4dfc7b55203c93608ef1f78997ef4f025fa84667 Mon Sep 17 00:00:00 2001 From: erickpeirson Date: Sat, 8 Sep 2018 07:15:08 -0400 Subject: [PATCH 02/77] ARXIVNG-1177 prototype API --- api.py | 5 ++ search/controllers/api/__init__.py | 90 +++++++++++++++++++++++++++ search/controllers/simple/forms.py | 19 +----- search/domain/__init__.py | 1 + search/domain/api.py | 22 +++++++ search/domain/base.py | 18 ++++++ search/encode.py | 23 +++++++ search/factory.py | 24 ++++++- search/routes/api.py | 29 +++++++++ search/services/index/__init__.py | 19 +++--- search/services/index/highlighting.py | 2 + search/services/index/results.py | 23 +++++-- 12 files changed, 242 insertions(+), 33 deletions(-) create mode 100644 api.py create mode 100644 search/controllers/api/__init__.py create mode 100644 search/domain/api.py create mode 100644 search/encode.py create mode 100644 search/routes/api.py diff --git a/api.py b/api.py new file mode 100644 index 00000000..c51ea39e --- /dev/null +++ b/api.py @@ -0,0 +1,5 @@ +"""Provides application for development purposes.""" + +from search.factory import create_api_web_app + +app = create_api_web_app() diff --git a/search/controllers/api/__init__.py b/search/controllers/api/__init__.py new file mode 100644 index 00000000..85fff358 --- /dev/null +++ b/search/controllers/api/__init__.py @@ -0,0 +1,90 @@ +"""Controller for search API requests.""" + +from typing import Tuple, Dict, Any, Optional +import re +from datetime import date, datetime +from dateutil.relativedelta import relativedelta +import dateutil.parser +from pytz import timezone +import pytz + + +from werkzeug.datastructures import MultiDict, ImmutableMultiDict +from werkzeug.exceptions import InternalServerError, BadRequest, NotFound +from flask import url_for + +from arxiv import status, taxonomy + +from search.services import index, fulltext, metadata +from search.controllers.util import paginate +from ...domain import Query, APIQuery, FieldedSearchList, FieldedSearchTerm, \ + DateRange, ClassificationList, Classification, asdict + +Response = Tuple[Dict[str, Any], int, Dict[str, Any]] +EASTERN = timezone('US/Eastern') + + +def _get_fielded_terms(params: MultiDict) -> Optional[FieldedSearchList]: + terms = FieldedSearchList() + for field, _ in Query.SUPPORTED_FIELDS: + values = params.getlist(field) + for value in values: + terms.append( + FieldedSearchTerm(operator='AND', field=field, term=value) + ) + if len(terms) == 0: + return + return terms + + +def _get_date_params(params: MultiDict) -> Optional[DateRange]: + date_params = {} + for field in ['start_date', 'end_date', 'date_type']: + value = params.getlist(field) + if not value: + continue + try: + dt = dateutil.parser.parse(value[0]) + if not dt.tzinfo: + dt = pytz.utc.localize(dt) + dt = dt.replace(tzinfo=EASTERN) + except ValueError: + raise BadRequest({'field': field, 'reason': 'invalid datetime'}) + date_params[field] = dt + if date_params: + return DateRange(**date_params) + return + + +def _get_classifications(params: MultiDict) -> Optional[ClassificationList]: + classifications = ClassificationList() + for value in params.getlist('primary_classification'): + if value not in taxonomy.ARCHIVES: + raise BadRequest({ + 'field': 'primary_classification', + 'reason': 'not a valid archive' + }) + classifications.append( + Classification(archive=value) + ) + if len(classifications) == 0: + return + return classifications + + +def search(params: MultiDict) -> Response: + """Handle a search request from the API.""" + q = APIQuery() + terms = _get_fielded_terms(params) + if terms is not None: + q.terms = terms + date_range = _get_date_params(params) + if date_range is not None: + q.date_range = date_range + + classifications = _get_classifications(params) + if classifications is not None: + q.primary_classification = classifications + + q = paginate(q, params) + return asdict(index.search(q, highlight=False)), 200, {} diff --git a/search/controllers/simple/forms.py b/search/controllers/simple/forms.py index 9e60163a..2d5b6049 100644 --- a/search/controllers/simple/forms.py +++ b/search/controllers/simple/forms.py @@ -9,28 +9,13 @@ from search.controllers.util import does_not_start_with_wildcard, \ has_balanced_quotes, strip_white_space +from ...domain import Query class SimpleSearchForm(Form): """Provides a simple field-query search form.""" - searchtype = SelectField("Field", choices=[ - ('all', 'All fields'), - ('title', 'Title'), - ('author', 'Author(s)'), - ('abstract', 'Abstract'), - ('comments', 'Comments'), - ('journal_ref', 'Journal reference'), - ('acm_class', 'ACM classification'), - ('msc_class', 'MSC classification'), - ('report_num', 'Report number'), - ('paper_id', 'arXiv identifier'), - ('doi', 'DOI'), - ('orcid', 'ORCID'), - ('author_id', 'arXiv author ID'), - ('help', 'Help pages'), - ('full_text', 'Full text') - ]) + searchtype = SelectField("Field", choices=Query.SUPPORTED_FIELDS) query = StringField('Search or Article ID', filters=[strip_white_space], validators=[does_not_start_with_wildcard, diff --git a/search/domain/__init__.py b/search/domain/__init__.py index 8e3e3512..9247c384 100644 --- a/search/domain/__init__.py +++ b/search/domain/__init__.py @@ -11,3 +11,4 @@ # pylint: disable=wildcard-import from .base import * from .advanced import * +from .api import * diff --git a/search/domain/api.py b/search/domain/api.py new file mode 100644 index 00000000..9de39eb1 --- /dev/null +++ b/search/domain/api.py @@ -0,0 +1,22 @@ +"""API-specific domain classes.""" + +from .base import DateRange, Query, ClassificationList +from .advanced import FieldedSearchList, FieldedSearchTerm + +from dataclasses import dataclass, field +from typing import NamedTuple, Optional + + +@dataclass +class APIQuery(Query): + """ + Represents an API query. + + Similar to an advanced query. + """ + + date_range: Optional[DateRange] = None + primary_classification: ClassificationList = field( + default_factory=ClassificationList + ) + terms: FieldedSearchList = field(default_factory=FieldedSearchList) diff --git a/search/domain/base.py b/search/domain/base.py index 8a90a1eb..97edddba 100644 --- a/search/domain/base.py +++ b/search/domain/base.py @@ -125,6 +125,24 @@ def __str__(self) -> str: class Query: """Represents a search query originating from the UI or API.""" + SUPPORTED_FIELDS = [ + ('all', 'All fields'), + ('title', 'Title'), + ('author', 'Author(s)'), + ('abstract', 'Abstract'), + ('comments', 'Comments'), + ('journal_ref', 'Journal reference'), + ('acm_class', 'ACM classification'), + ('msc_class', 'MSC classification'), + ('report_num', 'Report number'), + ('paper_id', 'arXiv identifier'), + ('doi', 'DOI'), + ('orcid', 'ORCID'), + ('author_id', 'arXiv author ID'), + ('help', 'Help pages'), + ('full_text', 'Full text') + ] + order: Optional[str] = field(default=None) page_size: int = field(default=50) page_start: int = field(default=0) diff --git a/search/encode.py b/search/encode.py new file mode 100644 index 00000000..6a17ee3e --- /dev/null +++ b/search/encode.py @@ -0,0 +1,23 @@ +"""Utilities for response encoding/serialization.""" + +from datetime import date, datetime + +from flask.json import JSONEncoder + +from typing import Any, List, Union + + +class ISO8601JSONEncoder(JSONEncoder): + """Renders date and datetime objects as ISO8601 datetime strings.""" + + def default(self, obj: Any) -> Union[str, List[Any]]: + """Overriden to render date(time)s in isoformat.""" + try: + if isinstance(obj, (date, datetime)): + return obj.isoformat() + iterable = iter(obj) + except TypeError: + pass + else: + return list(iterable) + return JSONEncoder.default(self, obj) #type: ignore diff --git a/search/factory.py b/search/factory.py index db5ebebd..b4bcda9e 100644 --- a/search/factory.py +++ b/search/factory.py @@ -7,9 +7,10 @@ from arxiv.base import Base from arxiv.base.middleware import wrap, request_logs -from search.routes import ui +from search.routes import ui, api from search.services import index from search.converters import ArchiveConverter +from search.encode import ISO8601JSONEncoder s3 = FlaskS3() @@ -34,3 +35,24 @@ def create_ui_web_app() -> Flask: wrap(app, [request_logs.ClassicLogsMiddleware]) return app + + +def create_api_web_app() -> Flask: + """Initialize an instance of the search frontend UI web application.""" + logging.getLogger('boto').setLevel(logging.ERROR) + logging.getLogger('boto3').setLevel(logging.ERROR) + logging.getLogger('botocore').setLevel(logging.ERROR) + + app = Flask('search') + app.json_encoder = ISO8601JSONEncoder + app.config.from_pyfile('config.py') + + + index.init_app(app) + + Base(app) + app.register_blueprint(api.blueprint) + + wrap(app, [request_logs.ClassicLogsMiddleware]) + + return app diff --git a/search/routes/api.py b/search/routes/api.py new file mode 100644 index 00000000..86e1f7ae --- /dev/null +++ b/search/routes/api.py @@ -0,0 +1,29 @@ +import json +from typing import Dict, Callable, Union, Any, Optional, List +from functools import wraps +from urllib.parse import urljoin, urlparse, parse_qs, urlencode, urlunparse + +from flask.json import jsonify +from flask import Blueprint, render_template, redirect, request, Response, \ + url_for +from werkzeug.urls import Href, url_encode, url_parse, url_unparse, url_encode +from werkzeug.datastructures import MultiDict, ImmutableMultiDict + +from arxiv import status +from arxiv.base import logging +from werkzeug.exceptions import InternalServerError +from search.controllers import api + +# from arxiv.users.auth.decorators import scoped +# from arxiv.users.auth import scopes + +logger = logging.getLogger(__name__) + +blueprint = Blueprint('api', __name__, url_prefix='/') + + +# @scoped(scopes.READ_API) +@blueprint.route('/', methods=['GET']) +def search() -> Response: + data, status, headers = api.search(request.args) + return jsonify(data), status, headers diff --git a/search/services/index/__init__.py b/search/services/index/__init__.py index 58f00672..cee1ea40 100644 --- a/search/services/index/__init__.py +++ b/search/services/index/__init__.py @@ -33,7 +33,7 @@ from search.context import get_application_config, get_application_global from arxiv.base import logging from search.domain import Document, DocumentSet, Query, AdvancedQuery, \ - SimpleQuery, asdict + SimpleQuery, asdict, APIQuery from .exceptions import QueryError, IndexConnectionError, DocumentNotFound, \ IndexingError, OutsideAllowedRange, MappingError @@ -358,7 +358,7 @@ def get_document(self, document_id: int) -> Document: return Document(**record['_source']) # type: ignore # See https://github.com/python/mypy/issues/3937 - def search(self, query: Query) -> DocumentSet: + def search(self, query: Query, highlight: bool = True) -> DocumentSet: """ Perform a search. @@ -389,7 +389,7 @@ def search(self, query: Query) -> DocumentSet: logger.debug('got current search request %s', str(query)) current_search = self._base_search() try: - if isinstance(query, AdvancedQuery): + if isinstance(query, AdvancedQuery) or isinstance(query, APIQuery): current_search = advanced_search(current_search, query) elif isinstance(query, SimpleQuery): current_search = simple_search(current_search, query) @@ -397,16 +397,17 @@ def search(self, query: Query) -> DocumentSet: logger.error('Malformed query: %s', str(e)) raise QueryError('Malformed query') from e - # Highlighting is performed by Elasticsearch; here we include the - # fields and configuration for highlighting. - current_search = highlight(current_search) + if highlight: + # Highlighting is performed by Elasticsearch; here we include the + # fields and configuration for highlighting. + current_search = highlight(current_search) with handle_es_exceptions(): # Slicing the search adds pagination parameters to the request. resp = current_search[query.page_start:query.page_end].execute() # Perform post-processing on the search results. - return results.to_documentset(query, resp) + return results.to_documentset(query, resp, highlight=highlight) def exists(self, paper_id_v: str) -> bool: """Determine whether a paper exists in the index.""" @@ -456,9 +457,9 @@ def current_session() -> SearchSession: @wraps(SearchSession.search) -def search(query: Query) -> DocumentSet: +def search(query: Query, highlight: bool = True) -> DocumentSet: """Retrieve search results.""" - return current_session().search(query) + return current_session().search(query, highlight=highlight) @wraps(SearchSession.add_document) diff --git a/search/services/index/highlighting.py b/search/services/index/highlighting.py index c9f4a504..82ec7c4b 100644 --- a/search/services/index/highlighting.py +++ b/search/services/index/highlighting.py @@ -174,6 +174,8 @@ def add_highlighting(result: dict, raw: Response) -> dict: # them together. Note that dir(None) won't return anything, so this block # is skipped if there are no highlights from ES. for field in dir(highlighted_fields): + if field.startswith('_'): + continue value = getattr(highlighted_fields, field) if hasattr(value, '__iter__'): value = '…'.join(value) diff --git a/search/services/index/results.py b/search/services/index/results.py index 27174c75..88f89db5 100644 --- a/search/services/index/results.py +++ b/search/services/index/results.py @@ -10,6 +10,7 @@ from typing import Any, Dict from elasticsearch_dsl.response import Response +from elasticsearch_dsl.utils import AttrList, AttrDict from search.domain import Document, Query, DocumentSet from arxiv.base import logging @@ -20,7 +21,7 @@ logger.propagate = False -def _to_document(raw: Response) -> Document: +def _to_document(raw: Response, highlight: bool = True) -> Document: """Transform an ES search result back into a :class:`.Document`.""" # typing: ignore result: Dict[str, Any] = {} @@ -31,6 +32,14 @@ def _to_document(raw: Response) -> Document: if not hasattr(raw, key): continue value = getattr(raw, key) + + # We want to prevent ES-specific data types from escaping the module + # API. + if isinstance(value, AttrList): + value = value._l_ + elif isinstance(value, AttrDict): + value = value.to_dict() + if key == 'announced_date_first' and value and isinstance(value, str): value = datetime.strptime(value, '%Y-%m').date() if key in ['submitted_date', 'submitted_date_first', @@ -47,18 +56,20 @@ def _to_document(raw: Response) -> Document: result[key] = value result['score'] = raw.meta.score - if type(result['abstract']) is str: + if type(result['abstract']) is str and highlight: result['preview']['abstract'] = preview(result['abstract']) if result['preview']['abstract'].endswith('…'): result['truncated']['abstract'] = True - logger.debug('%s: add highlighting to result', raw.paper_id) - result = add_highlighting(result, raw) + if highlight: + logger.debug('%s: add highlighting to result', raw.paper_id) + result = add_highlighting(result, raw) return Document(**result) # type: ignore # See https://github.com/python/mypy/issues/3937 -def to_documentset(query: Query, response: Response) -> DocumentSet: +def to_documentset(query: Query, response: Response, highlight: bool = True) \ + -> DocumentSet: """ Transform a response from ES to a :class:`.DocumentSet`. @@ -93,6 +104,6 @@ def to_documentset(query: Query, response: Response) -> DocumentSet: 'page_size': query.page_size, 'max_pages': max_pages }, - 'results': [_to_document(raw) for raw in response] + 'results': [_to_document(raw, highlight=highlight) for raw in response] }) # See https://github.com/python/mypy/issues/3937 From 65c471159f1f3bd615f6dadc118a4c9439e0a0d2 Mon Sep 17 00:00:00 2001 From: erickpeirson Date: Sat, 8 Sep 2018 07:21:30 -0400 Subject: [PATCH 03/77] set max page size --- search/controllers/api/__init__.py | 52 +++++++++++++++++++----------- search/controllers/util.py | 4 +-- search/domain/base.py | 3 ++ search/services/index/__init__.py | 4 +-- 4 files changed, 41 insertions(+), 22 deletions(-) diff --git a/search/controllers/api/__init__.py b/search/controllers/api/__init__.py index 85fff358..851032f7 100644 --- a/search/controllers/api/__init__.py +++ b/search/controllers/api/__init__.py @@ -24,6 +24,40 @@ EASTERN = timezone('US/Eastern') +def search(params: MultiDict) -> Response: + """ + Handle a search request from the API. + + Parameters + ---------- + params : :class:`MultiDict` + GET query parameters from the request. + + Returns + ------- + dict + Response data (to serialize). + int + HTTP status code. + dict + Extra headers for the response. + """ + q = APIQuery() + terms = _get_fielded_terms(params) + if terms is not None: + q.terms = terms + date_range = _get_date_params(params) + if date_range is not None: + q.date_range = date_range + + classifications = _get_classifications(params) + if classifications is not None: + q.primary_classification = classifications + + q = paginate(q, params) + return asdict(index.search(q, highlight=False)), status.HTTP_200_OK, {} + + def _get_fielded_terms(params: MultiDict) -> Optional[FieldedSearchList]: terms = FieldedSearchList() for field, _ in Query.SUPPORTED_FIELDS: @@ -70,21 +104,3 @@ def _get_classifications(params: MultiDict) -> Optional[ClassificationList]: if len(classifications) == 0: return return classifications - - -def search(params: MultiDict) -> Response: - """Handle a search request from the API.""" - q = APIQuery() - terms = _get_fielded_terms(params) - if terms is not None: - q.terms = terms - date_range = _get_date_params(params) - if date_range is not None: - q.date_range = date_range - - classifications = _get_classifications(params) - if classifications is not None: - q.primary_classification = classifications - - q = paginate(q, params) - return asdict(index.search(q, highlight=False)), 200, {} diff --git a/search/controllers/util.py b/search/controllers/util.py index 9110ad3f..0444fc6f 100644 --- a/search/controllers/util.py +++ b/search/controllers/util.py @@ -52,8 +52,8 @@ def paginate(query: Query, data: dict) -> Query: :class:`.Query` """ - query.page_start = int(data.get('start', 0)) - query.page_size = int(data.get('size', 50)) + query.page_start = max(int(data.get('start', 0)), 0) + query.page_size = min(int(data.get('size', 50)), Query.MAXIMUM_PAGE_SIZE) return query diff --git a/search/domain/base.py b/search/domain/base.py index 97edddba..6c7cf658 100644 --- a/search/domain/base.py +++ b/search/domain/base.py @@ -125,6 +125,9 @@ def __str__(self) -> str: class Query: """Represents a search query originating from the UI or API.""" + MAXIMUM_PAGE_SIZE = 500 + """The maximum number of records that can be retrieved.""" + SUPPORTED_FIELDS = [ ('all', 'All fields'), ('title', 'Title'), diff --git a/search/services/index/__init__.py b/search/services/index/__init__.py index cee1ea40..262d89ac 100644 --- a/search/services/index/__init__.py +++ b/search/services/index/__init__.py @@ -40,7 +40,7 @@ from .util import MAX_RESULTS from .advanced import advanced_search from .simple import simple_search -from .highlighting import highlight +from . import highlighting from . import results logger = logging.getLogger(__name__) @@ -400,7 +400,7 @@ def search(self, query: Query, highlight: bool = True) -> DocumentSet: if highlight: # Highlighting is performed by Elasticsearch; here we include the # fields and configuration for highlighting. - current_search = highlight(current_search) + current_search = highlighting.highlight(current_search) with handle_es_exceptions(): # Slicing the search adds pagination parameters to the request. From 7e079d64eeb4209b3baed00d4a61b82651691cbc Mon Sep 17 00:00:00 2001 From: erickpeirson Date: Sat, 8 Sep 2018 07:55:29 -0400 Subject: [PATCH 04/77] ARXIVNG-1177 updating openapi and jsonschema --- requirements/dev.txt | 45 --- requirements/prod.txt | 45 --- requirements/test.txt | 6 - schema/DocumentSet.json | 41 --- schema/{ => resources}/Document.json | 36 +-- schema/resources/DocumentMetadata.json | 301 +++++++++++++++++++ schema/resources/DocumentSet.json | 42 +++ {api => schema}/search.yaml | 19 +- search/controllers/util.py | 2 +- search/domain/base.py | 8 +- search/routes/ui.py | 4 +- search/services/index/__init__.py | 8 +- search/services/index/results.py | 24 +- search/services/index/tests/tests.py | 8 +- search/templates/search/advanced_search.html | 4 +- search/templates/search/search-macros.html | 14 +- search/templates/search/search.html | 2 +- tests/integration/test_search_integration.py | 18 +- 18 files changed, 427 insertions(+), 200 deletions(-) delete mode 100644 requirements/dev.txt delete mode 100644 requirements/prod.txt delete mode 100644 requirements/test.txt delete mode 100644 schema/DocumentSet.json rename schema/{ => resources}/Document.json (90%) create mode 100644 schema/resources/DocumentMetadata.json create mode 100644 schema/resources/DocumentSet.json rename {api => schema}/search.yaml (80%) diff --git a/requirements/dev.txt b/requirements/dev.txt deleted file mode 100644 index 9a97fe49..00000000 --- a/requirements/dev.txt +++ /dev/null @@ -1,45 +0,0 @@ -amazon-kclpy==1.4.4 -arxiv-base==0.6.1 -boto==2.48.0 -boto3==1.6.6 -botocore==1.9.6 -certifi==2017.7.27.1 -chardet==3.0.4 -click==6.7 -coverage==4.4.2 -dataclasses==0.4 -docutils==0.14 -elasticsearch==6.1.1 -elasticsearch-dsl==6.1.0 -Flask==0.12.2 -Flask-S3==0.3.3 -idna==2.6 -ipaddress==1.0.19 -itsdangerous==0.24 -Jinja2==2.10 -jmespath==0.9.3 -jsonschema==2.6.0 -MarkupSafe==1.0 -mccabe==0.6.1 -mock==2.0.0 -mypy==0.560 -nose2==0.7.3 -pbr==3.1.1 -psutil==5.4.3 -pycodestyle==2.3.1 -pydocstyle==2.1.1 -pyflakes==1.6.0 -pylama==7.4.3 -python-dateutil==2.6.1 -pytz==2017.3 -requests==2.18.4 -s3transfer==0.1.13 -six==1.11.0 -snowballstemmer==1.2.1 -thrift==0.11.0 -thrift-connector==0.23 -typed-ast==1.1.0 -urllib3==1.22 -Werkzeug==0.13 -WTForms==2.1 -bleach==2.0.0 diff --git a/requirements/prod.txt b/requirements/prod.txt deleted file mode 100644 index 9a97fe49..00000000 --- a/requirements/prod.txt +++ /dev/null @@ -1,45 +0,0 @@ -amazon-kclpy==1.4.4 -arxiv-base==0.6.1 -boto==2.48.0 -boto3==1.6.6 -botocore==1.9.6 -certifi==2017.7.27.1 -chardet==3.0.4 -click==6.7 -coverage==4.4.2 -dataclasses==0.4 -docutils==0.14 -elasticsearch==6.1.1 -elasticsearch-dsl==6.1.0 -Flask==0.12.2 -Flask-S3==0.3.3 -idna==2.6 -ipaddress==1.0.19 -itsdangerous==0.24 -Jinja2==2.10 -jmespath==0.9.3 -jsonschema==2.6.0 -MarkupSafe==1.0 -mccabe==0.6.1 -mock==2.0.0 -mypy==0.560 -nose2==0.7.3 -pbr==3.1.1 -psutil==5.4.3 -pycodestyle==2.3.1 -pydocstyle==2.1.1 -pyflakes==1.6.0 -pylama==7.4.3 -python-dateutil==2.6.1 -pytz==2017.3 -requests==2.18.4 -s3transfer==0.1.13 -six==1.11.0 -snowballstemmer==1.2.1 -thrift==0.11.0 -thrift-connector==0.23 -typed-ast==1.1.0 -urllib3==1.22 -Werkzeug==0.13 -WTForms==2.1 -bleach==2.0.0 diff --git a/requirements/test.txt b/requirements/test.txt deleted file mode 100644 index 49bc87f5..00000000 --- a/requirements/test.txt +++ /dev/null @@ -1,6 +0,0 @@ -coverage==4.5 -mypy==0.570 -nose2==0.7.3 -pylint==1.8.2 -coveralls==1.3.0 -pydocstyle==2.1.1 diff --git a/schema/DocumentSet.json b/schema/DocumentSet.json deleted file mode 100644 index 49bd8354..00000000 --- a/schema/DocumentSet.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "$schema": "http://json-schema.org/schema#", - "title": "DocumentSet", - "description": "Prototype schema for arXiv documents", - "type": "object", - "required": ["metadata", "results"], - "properties": { - "metadata": { - "type": "object", - "query": { - "description": "The request query from which the document set is derived", - "type": "string" - }, - "total": { - "description": "Total number of results that respond to the query.", - "type": "integer", - "minimum": 0 - }, - "pagination": { - "description": "Pagination details", - "type": "object", - "properties": { - "next": { - "type": "string", - "format": "uri", - "description": "URI for the next page of results" - }, - "previous": { - "type": "string", - "format": "uri", - "description": "URI for the previous page of results" - } - } - } - }, - "results": { - "type": "object", - "$ref": "Document.json#Document" - } - } -} diff --git a/schema/Document.json b/schema/resources/Document.json similarity index 90% rename from schema/Document.json rename to schema/resources/Document.json index 74166186..d2327220 100644 --- a/schema/Document.json +++ b/schema/resources/Document.json @@ -1,7 +1,7 @@ { "$schema": "http://json-schema.org/schema#", "title": "Document", - "description": "Prototype schema for arXiv documents", + "description": "Schema for arXiv document metadata returned by the search API.", "definitions": { "category": { "type": "object", @@ -147,21 +147,21 @@ }, "minItems": 0 }, - "announced_first": { + "announced_date_first": { "desription": "Date (year and month) the paper was originally announced", "type": "string" - } - "updated_date": { - "description": "Date this version of paper was last touched", - "type": "string" }, "modified_date": { "description": "Date this version of paper .abs file was last modified", "type": "string" }, - "document_id": { - "description": "Internal, version-independent document identifier", - "type": "integer" + "paper_id": { + "description": "Version-independent arXiv paper identifier.", + "type": "string" + }, + "paper_id_v": { + "description": "arXiv paper identifier with version affix.", + "type": "string" }, "doi": { "type": "string" @@ -209,13 +209,6 @@ }, "uniqueItems": true }, - "paper_id": { - "type": "string" - }, - "paper_id_v": { - "description": "Paper ID with version suffix", - "type": "string" - }, "primary_classification": { "$ref": "#/definitions/classification" }, @@ -224,7 +217,7 @@ "items": {"$ref": "#/definitions/classification"} }, "proxy": { - "type": "string" + "type": "boolean" }, "report_num": { "type": "string" @@ -245,7 +238,7 @@ "required": ["size_bytes"] }, "submitter": { - "description": "Submitter data. Name and email may not match those associated with user account, since user data is copied to submission data at the time of submission creation.", + "description": "Submitter data. Name may not match those associated with user account, since user data is copied to submission data at the time of submission creation.", "type": "object", "properties": { "name": { @@ -261,24 +254,27 @@ "type": "string" } }, - "required": ["email", "name"] + "required": ["name"] }, "title": { "type": "string" }, "title_utf8": { - "description": "title field with texisms converted to utf-8 equivalents", + "description": "Title field with texisms converted to utf-8 equivalents", "type": "string" }, "version": { + "description": "The version number for this paper.", "minimum": 1, "type": "integer" }, "latest_version": { + "description": "Number of the latest version of this paper.", "minimum": 1, "type": "integer" }, "latest": { + "description": "arXiv paper identifier (with version affix) of latest version of this paper.", "type": "string" } }, diff --git a/schema/resources/DocumentMetadata.json b/schema/resources/DocumentMetadata.json new file mode 100644 index 00000000..84b791f5 --- /dev/null +++ b/schema/resources/DocumentMetadata.json @@ -0,0 +1,301 @@ +{ + "$schema": "http://json-schema.org/schema#", + "title": "DocumentMetadata", + "description": "Schema for arXiv document metadata provided by the docmeta endpoint.", + "definitions": { + "category": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "name": { + "type": "string" + } + }, + "required": ["id", "name"] + }, + "archive": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "name": { + "type": "string" + } + }, + "required": ["id", "name"] + }, + "group": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "name": { + "type": "string" + } + }, + "required": ["id", "name"] + }, + "classification": { + "type": "object", + "properties": { + "archive": {"$ref": "#/definitions/archive"}, + "group": {"$ref": "#/definitions/group"}, + "category": {"$ref": "#/definitions/category"} + } + } + }, + "type": "object", + "properties": { + "abs_categories": { + "description": "Categories as they would appear on the /abs page", + "type": "string" + }, + "abstract": { + "type": "string" + }, + "abstract_utf8": { + "description": "abstract field with texisms converted to utf-8 equivalents", + "type": "string" + }, + "acm_class": { + "description": "Classifications from ACM Computing Classification System", + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "owners": { + "items": { + "type": "object", + "properties": { + "first_name": { + "type": "string" + }, + "last_name": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "affiliation": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 0 + }, + "orcid": { + "type": "string" + } + }, + "required": ["last_name"] + } + }, + "authors": { + "items": { + "type": "object", + "properties": { + "first_name": { + "type": "string" + }, + "last_name": { + "type": "string" + }, + "suffix": { + "type": "string" + }, + "author_id": { + "type": "string" + }, + "orcid": { + "type": "string" + }, + "affiliation": { + "type": "array", + "items": { + "type": "string" + }, + "minItems": 0 + } + }, + "required": ["last_name"] + }, + "minItems": 1, + "type": "array" + }, + "comments": { + "type": ["string"] + }, + "comments_utf8": { + "description": "comments field with texisms converted to utf-8 equivalents", + "type": ["string"] + }, + "submitted_date": { + "description": "Date this version of paper was submitted", + "type": "string" + }, + "submitted_date_all": { + "description": "Submitted dates of all of the versions of this paper", + "type": "array", + "items": { + "type": "string" + }, + "minItems": 0 + }, + "announced_first": { + "desription": "Date (year and month) the paper was originally announced", + "type": "string" + } + "updated_date": { + "description": "Date this version of paper was last touched", + "type": "string" + }, + "modified_date": { + "description": "Date this version of paper .abs file was last modified", + "type": "string" + }, + "document_id": { + "description": "Internal, version-independent document identifier", + "type": "integer" + }, + "doi": { + "type": "string" + }, + "formats": { + "description": "Derivative paper formats available to users", + "type": "array", + "minItems": 0 + }, + "fulltext": { + "type": "string" + }, + "is_current": { + "type": "boolean" + }, + "is_withdrawn": { + "type": "boolean" + }, + "journal_ref": { + "type": ["string"] + }, + "journal_ref_utf8": { + "description": "journal_ref field with texisms converted to utf-8 equivalents", + "type": ["string"] + }, + "license": { + "type": "object", + "properties": { + "uri": { + "type": ["string", "null"] + }, + "label": { + "type": ["string", "null"] + } + } + }, + "metadata_id": { + "type": "integer" + }, + "msc_class": { + "description": "Classifications from American Mathematical Society Mathematical Subject Classification (MSC)", + "type": "array", + "items": { + "type": "string" + }, + "uniqueItems": true + }, + "paper_id": { + "type": "string" + }, + "paper_id_v": { + "description": "Paper ID with version suffix", + "type": "string" + }, + "primary_classification": { + "$ref": "#/definitions/classification" + }, + "secondary_classification": { + "type": "array", + "items": {"$ref": "#/definitions/classification"} + }, + "proxy": { + "type": "string" + }, + "report_num": { + "type": "string" + }, + "source": { + "properties": { + "flags": { + "type": ["string", "null"] + }, + "format": { + "type": ["string", "null"] + }, + "size_bytes": { + "minimum": 0, + "type": "integer" + } + }, + "required": ["size_bytes"] + }, + "submitter": { + "description": "Submitter data. Name and email may not match those associated with user account, since user data is copied to submission data at the time of submission creation.", + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "is_author": { + "type": "boolean" + }, + "author_id": { + "type:": "string" + }, + "orcid": { + "type": "string" + } + }, + "required": ["email", "name"] + }, + "title": { + "type": "string" + }, + "title_utf8": { + "description": "title field with texisms converted to utf-8 equivalents", + "type": "string" + }, + "version": { + "minimum": 1, + "type": "integer" + }, + "latest_version": { + "minimum": 1, + "type": "integer" + }, + "latest": { + "type": "string" + } + }, + "required": [ + "abstract", + "authors", + "submitted_date", + "submitted_date_latest", + "submitted_date_first", + "is_current", + "is_withdrawn", + "license", + "paper_id", + "paper_id_v", + "primary_classification", + "title", + "source", + "version" + ] +} diff --git a/schema/resources/DocumentSet.json b/schema/resources/DocumentSet.json new file mode 100644 index 00000000..7cb584a8 --- /dev/null +++ b/schema/resources/DocumentSet.json @@ -0,0 +1,42 @@ +{ + "$schema": "http://json-schema.org/schema#", + "title": "DocumentSet", + "description": "A set of documents that respond to a query.", + "type": "object", + "required": ["metadata", "results"], + "properties": { + "metadata": { + "description": "Summary information about the search, including pagination.", + "properties": { + "start": { + "description": "Offset (zero-based) of first result in this documentset from start of original search results.", + "type": "integer" + }, + "end": { + "description": "Offset (zero-based) of last result in this documentset from start of original search results.", + "type": "integer" + }, + "total": { + "description": "Total number of documents that respond to this query.", + "type": "integer" + }, + "current_page": { + "description": "Estimated page number, based on start and size parameters.", + "type": "integer" + }, + "max_pages": { + "description": "Estimated last page in the entire search results, based on size and total parameters.", + "type": "integer" + }, + "total_pages": { + "description": "Estimated number of pages in the entire search results, based on size and total parameters.", + "type": "integer" + } + } + }, + "results": { + "type": "object", + "$ref": "Document.json#Document" + } + } +} diff --git a/api/search.yaml b/schema/search.yaml similarity index 80% rename from api/search.yaml rename to schema/search.yaml index b2e75ade..d676caf3 100644 --- a/api/search.yaml +++ b/schema/search.yaml @@ -12,13 +12,23 @@ info: servers: - url: https://arxiv.org/api paths: - /papers: + /: get: operationId: queryPapers description: | Returns all published arXiv papers that respond to the specified query parameters. By default, returns most recent papers first. parameters: + - name: all + in: query + description: | + Performs a query across all fields. Has the same behavior as the + simple search function on the main arXiv website. + required: false + style: form + explode: true + schema: + type: string - name: primary_category in: query description: | @@ -26,6 +36,7 @@ paths: should be limited. required: false style: form + explode: true schema: type: array items: @@ -36,14 +47,14 @@ paths: content: application/json: schema: - $ref: '../schema/DocumentSet.json#DocumentSet' + $ref: './resources/DocumentSet.json#DocumentSet' default: description: unexpected error content: application/json: schema: $ref: '#/components/schemas/Error' - /papers/{id}: + /{id}: get: description: Return metadata about an arXiv paper by arXiv ID. operationId: getPaperByID @@ -60,7 +71,7 @@ paths: content: application/json: schema: - $ref: '../schema/Document.json#Document' + $ref: './resources/Document.json#Document' default: description: unexpected error content: diff --git a/search/controllers/util.py b/search/controllers/util.py index 0444fc6f..a506f090 100644 --- a/search/controllers/util.py +++ b/search/controllers/util.py @@ -53,7 +53,7 @@ def paginate(query: Query, data: dict) -> Query: """ query.page_start = max(int(data.get('start', 0)), 0) - query.page_size = min(int(data.get('size', 50)), Query.MAXIMUM_PAGE_SIZE) + query.size = min(int(data.get('size', 50)), Query.MAXIMUM_size) return query diff --git a/search/domain/base.py b/search/domain/base.py index 6c7cf658..91dc0596 100644 --- a/search/domain/base.py +++ b/search/domain/base.py @@ -125,7 +125,7 @@ def __str__(self) -> str: class Query: """Represents a search query originating from the UI or API.""" - MAXIMUM_PAGE_SIZE = 500 + MAXIMUM_size = 500 """The maximum number of records that can be retrieved.""" SUPPORTED_FIELDS = [ @@ -147,7 +147,7 @@ class Query: ] order: Optional[str] = field(default=None) - page_size: int = field(default=50) + size: int = field(default=50) page_start: int = field(default=0) include_older_versions: bool = field(default=False) hide_abstracts: bool = field(default=False) @@ -155,12 +155,12 @@ class Query: @property def page_end(self) -> int: """Get the index/offset of the end of the page.""" - return self.page_start + self.page_size + return self.page_start + self.size @property def page(self) -> int: """Get the approximate page number.""" - return 1 + int(round(self.page_start/self.page_size)) + return 1 + int(round(self.page_start/self.size)) def __str__(self) -> str: """Build a string representation, for use in rendering.""" diff --git a/search/routes/ui.py b/search/routes/ui.py index b0ac8be5..efe8341d 100644 --- a/search/routes/ui.py +++ b/search/routes/ui.py @@ -151,12 +151,12 @@ def external_url(service: str, name: str, **parameters: Any) \ @blueprint.context_processor def url_for_page_builder() -> Dict[str, Callable]: """Add a page URL builder function to the template context.""" - def url_for_page(page: int, page_size: int) -> str: + def url_for_page(page: int, size: int) -> str: """Build an URL to for a search result page.""" rule = request.url_rule parts = url_parse(url_for(rule.endpoint)) args = request.args.copy() - args['start'] = (page - 1) * page_size + args['start'] = (page - 1) * size parts = parts.replace(query=url_encode(args)) url: str = url_unparse(parts) return url diff --git a/search/services/index/__init__.py b/search/services/index/__init__.py index 262d89ac..84d9f0d5 100644 --- a/search/services/index/__init__.py +++ b/search/services/index/__init__.py @@ -270,7 +270,8 @@ def add_document(self, document: Document) -> None: Parameters ---------- document : :class:`.Document` - Must be a valid search document, per ``schema/Document.json``. + Must be a valid search document, per + ``schema/DocumentMetadata.json``. Raises ------ @@ -297,7 +298,8 @@ def bulk_add_documents(self, documents: List[Document], Parameters ---------- document : :class:`.Document` - Must be a valid search document, per ``schema/Document.json``. + Must be a valid search document, per + ``schema/DocumentMetadata.json``. docs_per_chunk: int Number of documents to send to ES in a single chunk Raises @@ -379,7 +381,7 @@ def search(self, query: Query, highlight: bool = True) -> DocumentSet: """ # Make sure that the user is not requesting a nonexistant page. - max_pages = int(MAX_RESULTS/query.page_size) + max_pages = int(MAX_RESULTS/query.size) if query.page > max_pages: _message = f'Requested page {query.page}, but max is {max_pages}' logger.error(_message) diff --git a/search/services/index/results.py b/search/services/index/results.py index 88f89db5..fe188564 100644 --- a/search/services/index/results.py +++ b/search/services/index/results.py @@ -21,11 +21,16 @@ logger.propagate = False +def _to_author(author_data: dict) -> dict: + """Prevent e-mail, other extraneous data, from escaping.""" + return {k: v for k, v in author_data.items() if k != 'email'} + + def _to_document(raw: Response, highlight: bool = True) -> Document: """Transform an ES search result back into a :class:`.Document`.""" # typing: ignore result: Dict[str, Any] = {} - result['highlight'] = {} + result['match'] = {} # Hit on field, but no highlighting. result['truncated'] = {} # Preview is truncated. for key in Document.fields(): @@ -40,6 +45,11 @@ def _to_document(raw: Response, highlight: bool = True) -> Document: elif isinstance(value, AttrDict): value = value.to_dict() + if key in ['authors', 'owners']: + value = [_to_author(au) for au in value] + elif key == 'submitter': + value = _to_author(value) + if key == 'announced_date_first' and value and isinstance(value, str): value = datetime.strptime(value, '%Y-%m').date() if key in ['submitted_date', 'submitted_date_first', @@ -62,8 +72,10 @@ def _to_document(raw: Response, highlight: bool = True) -> Document: result['truncated']['abstract'] = True if highlight: + result['highlight'] = {} logger.debug('%s: add highlighting to result', raw.paper_id) result = add_highlighting(result, raw) + return Document(**result) # type: ignore # See https://github.com/python/mypy/issues/3937 @@ -87,21 +99,21 @@ def to_documentset(query: Query, response: Response, highlight: bool = True) \ page, along with pagination metadata. """ - max_pages = int(MAX_RESULTS/query.page_size) - N_pages_raw = response['hits']['total']/query.page_size + max_pages = int(MAX_RESULTS/query.size) + N_pages_raw = response['hits']['total']/query.size N_pages = int(floor(N_pages_raw)) + \ - int(N_pages_raw % query.page_size > 0) + int(N_pages_raw % query.size > 0) logger.debug('got %i results', response['hits']['total']) return DocumentSet(**{ # type: ignore 'metadata': { 'start': query.page_start, - 'end': min(query.page_start + query.page_size, + 'end': min(query.page_start + query.size, response['hits']['total']), 'total': response['hits']['total'], 'current_page': query.page, 'total_pages': N_pages, - 'page_size': query.page_size, + 'size': query.size, 'max_pages': max_pages }, 'results': [_to_document(raw, highlight=highlight) for raw in response] diff --git a/search/services/index/tests/tests.py b/search/services/index/tests/tests.py index 34d5e3ec..e671c1fa 100644 --- a/search/services/index/tests/tests.py +++ b/search/services/index/tests/tests.py @@ -41,7 +41,7 @@ def test_advanced_query(self, mock_Elasticsearch, mock_Search): query = AdvancedQuery( order='relevance', - page_size=10, + size=10, date_range=DateRange( start_date=datetime.now() - timedelta(days=5), end_date=datetime.now() @@ -80,7 +80,7 @@ def test_advanced_query(self, mock_Elasticsearch, mock_Search): self.assertEqual(document_set.metadata['total'], 53) self.assertEqual(document_set.metadata['current_page'], 1) self.assertEqual(document_set.metadata['total_pages'], 6) - self.assertEqual(document_set.metadata['page_size'], 10) + self.assertEqual(document_set.metadata['size'], 10) self.assertEqual(len(document_set.results), 1) @mock.patch('search.services.index.Search') @@ -105,7 +105,7 @@ def test_simple_query(self, mock_Elasticsearch, mock_Search): query = SimpleQuery( order='relevance', - page_size=10, + size=10, search_field='title', value='foo title' ) @@ -115,7 +115,7 @@ def test_simple_query(self, mock_Elasticsearch, mock_Search): self.assertEqual(document_set.metadata['total'], 53) self.assertEqual(document_set.metadata['current_page'], 1) self.assertEqual(document_set.metadata['total_pages'], 6) - self.assertEqual(document_set.metadata['page_size'], 10) + self.assertEqual(document_set.metadata['size'], 10) self.assertEqual(len(document_set.results), 1) diff --git a/search/templates/search/advanced_search.html b/search/templates/search/advanced_search.html index 7a2efb42..092abd9f 100644 --- a/search/templates/search/advanced_search.html +++ b/search/templates/search/advanced_search.html @@ -328,7 +328,7 @@
{# TODO - adjust this layout so that it matches across all forms less awkwardly #}
{% if query %} - Simple Search + Simple Search {% else %} Simple Search {% endif %} @@ -353,7 +353,7 @@
{% if query %} - Simple Search + Simple Search {% else %} Simple Search {% endif %} diff --git a/search/templates/search/search-macros.html b/search/templates/search/search-macros.html index d4ffec14..a7b64000 100644 --- a/search/templates/search/search-macros.html +++ b/search/templates/search/search-macros.html @@ -34,7 +34,7 @@ {% macro pagination(metadata, url_for_page) -%}