Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Initial boolean support ARXIVNG-1971 #234

Merged
merged 18 commits into from
Jun 11, 2019
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Atom serializer
JaimieMurdock committed Apr 2, 2019

Verified

This commit was signed with the committer’s verified signature. The key has expired.
coreyja Corey Alexander
commit 6ee11de002d72287d83def85ad1467f5b8a0d898
1 change: 1 addition & 0 deletions Pipfile
Original file line number Diff line number Diff line change
@@ -17,6 +17,7 @@ dataclasses = "==0.4"
docutils = "==0.14"
elasticsearch = "==6.2.0"
elasticsearch-dsl = "==6.3.1"
feedgen = "==0.7.0"
flask = "==1.0.2"
"flask-s3" = "==0.3.3"
idna = "==2.6"
13 changes: 10 additions & 3 deletions Pipfile.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions search/controllers/api/__init__.py
Original file line number Diff line number Diff line change
@@ -145,6 +145,15 @@ def classic_query(params: MultiDict) -> Tuple[Dict[str, Any], int, Dict[str, Any
params['query'] = raw_query
del params['search_query']

params.add('include', 'abstract')
params.add('include', 'submitted_date')
params.add('include', 'updated_date')
params.add('include', 'comments')
params.add('include', 'journal_ref')
params.add('include', 'doi')
params.add('include', 'primary_classification')
params.add('include', 'secondary_classification')
params.add('include', 'authors')
# pass to normal search, which will handle parsing
data, _, _ = search(params) # type: ignore

259 changes: 259 additions & 0 deletions search/routes/api/atom_extensions.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,259 @@
"""Classes derived from the Feedgen extension classes."""

from typing import Dict
from feedgen.ext.base import BaseEntryExtension, BaseExtension
from feedgen.entry import FeedEntry
from feedgen.feed import FeedGenerator
from lxml import etree


class OpenSearchExtension(BaseExtension):
    """Feedgen feed extension that emits OpenSearch paging metadata.

    Holds three optional values (``totalResults``, ``startIndex`` and
    ``itemsPerPage``) and writes each one that has been set as a child
    element in the OpenSearch 1.1 namespace when the Atom feed is built.
    """

    def __init__(self: BaseExtension) -> None:
        """Initialize every OpenSearch value as unset (``None``)."""
        self.__opensearch_totalResults = None
        self.__opensearch_startIndex = None
        self.__opensearch_itemsPerPage = None

    def extend_atom(self: BaseExtension, atom_feed: FeedGenerator) -> FeedGenerator:
        """
        Add the OpenSearch elements that have been set to the Atom feed.

        Parameters
        ----------
        atom_feed
            The feed object to extend. It is handed to ``etree.SubElement``
            as the parent, so it is used here as an XML element.

        Returns
        -------
        FeedGenerator
            The provided feed object, with the new child elements attached.

        """
        namespace = '{http://a9.com/-/spec/opensearch/1.1/}'
        # Emission order matches the original implementation.
        pending = (
            ('itemsPerPage', self.__opensearch_itemsPerPage),
            ('totalResults', self.__opensearch_totalResults),
            ('startIndex', self.__opensearch_startIndex),
        )
        for tag, value in pending:
            # Unset values are skipped entirely rather than written empty.
            if value is not None:
                element = etree.SubElement(atom_feed, namespace + tag)
                element.text = value

        return atom_feed

    @staticmethod
    def extend_rss(rss_feed: FeedGenerator) -> FeedGenerator:
        """
        Return the RSS feed unchanged; no OpenSearch data is added to RSS.

        Parameters
        ----------
        rss_feed
            The feed object for RSS results.

        Returns
        -------
        FeedGenerator
            The provided feed object, unmodified.

        """
        return rss_feed

    @staticmethod
    def extend_ns() -> Dict[str, str]:
        """
        Declare the XML namespace used by this extension.

        Returns
        -------
        dict
            Mapping of the "opensearch" prefix to the OpenSearch 1.1
            namespace URI.

        """
        return {'opensearch': 'http://a9.com/-/spec/opensearch/1.1/'}

    def totalResults(self: BaseExtension, text: str) -> None:
        """Set the totalResults value; ``None`` leaves it unset."""
        # str(None) would be the literal 'None' and end up in the feed.
        self.__opensearch_totalResults = None if text is None else str(text)

    def startIndex(self: BaseExtension, text: str) -> None:
        """Set the startIndex value; ``None`` leaves it unset."""
        self.__opensearch_startIndex = None if text is None else str(text)

    def itemsPerPage(self: BaseExtension, text: str) -> None:
        """Set the itemsPerPage value; ``None`` leaves it unset."""
        self.__opensearch_itemsPerPage = None if text is None else str(text)


class ArxivExtension(BaseExtension):
    """Feed-level Feedgen extension whose only job is to declare the arXiv namespace."""

    def __init__(self: BaseExtension) -> None:
        """Do nothing; this extension keeps no state."""

    @staticmethod
    def extend_atom(atom_feed: FeedGenerator) -> FeedGenerator:
        """
        Return the Atom feed untouched.

        Parameters
        ----------
        atom_feed
            The feed object for Atom results.

        Returns
        -------
        FeedGenerator
            The same object that was passed in.

        """
        return atom_feed

    @staticmethod
    def extend_rss(rss_feed: FeedGenerator) -> FeedGenerator:
        """
        Return the RSS feed untouched.

        Parameters
        ----------
        rss_feed
            The feed object for RSS results.

        Returns
        -------
        FeedGenerator
            The same object that was passed in.

        """
        return rss_feed

    @staticmethod
    def extend_ns() -> Dict[str, str]:
        """
        Declare the XML namespace contributed by this extension.

        Returns
        -------
        dict
            Mapping of the "arxiv" prefix to its namespace URI.

        """
        return {'arxiv': 'http://arxiv.org/schemas/atom'}


class ArxivEntryExtension(BaseEntryExtension):
    """Entry-level Feedgen extension that adds arXiv elements to Atom entries.

    Values are stored through the setter methods (``comment``,
    ``primary_category``, ``journal_ref``, ``doi``) and written as child
    elements in the ``http://arxiv.org/schemas/atom`` namespace by
    ``extend_atom``.
    """

    def __init__(self: BaseEntryExtension):
        """Initialize the member values to all be empty."""
        self.__arxiv_comment = None
        self.__arxiv_primary_category = None
        self.__arxiv_doi = None
        # Not read or written anywhere else in this class.
        self.__arxiv_affiliation = None
        self.__arxiv_journal_ref = None

    def extend_atom(self: BaseEntryExtension, entry: FeedEntry) -> FeedEntry:
        """
        Add this extension's new elements to the Atom feed entry.

        Only values that are truthy are written; unset or empty values
        produce no element.

        Parameters
        ----------
        entry
            The entry to modify. It is handed to ``etree.SubElement`` as the
            parent, so it is used here as an XML element.

        Returns
        -------
        FeedEntry
            The modified entry.

        """
        namespace = '{http://arxiv.org/schemas/atom}'

        if self.__arxiv_comment:
            comment_element = etree.SubElement(entry, namespace + 'comment')
            comment_element.text = self.__arxiv_comment

        if self.__arxiv_primary_category:
            primary_category_element = etree.SubElement(
                entry, namespace + 'primary_category')
            # The category goes in a ``term`` attribute, not the element text.
            primary_category_element.attrib['term'] = \
                self.__arxiv_primary_category

        if self.__arxiv_journal_ref:
            journal_ref_element = etree.SubElement(
                entry, namespace + 'journal_ref')
            journal_ref_element.text = self.__arxiv_journal_ref

        if self.__arxiv_doi:
            # One <arxiv:doi> element per stored DOI.
            for doi in self.__arxiv_doi:
                doi_element = etree.SubElement(entry, namespace + 'doi')
                doi_element.text = doi

        return entry

    @staticmethod
    def extend_rss(entry: FeedEntry) -> FeedEntry:
        """
        Return the RSS entry unchanged; no arXiv elements are added to RSS.

        Parameters
        ----------
        entry
            The entry for RSS results.

        Returns
        -------
        FeedEntry
            The provided entry, unmodified.

        """
        return entry

    def comment(self: BaseEntryExtension, text: str) -> None:
        """
        Assign the comment value to this entry.

        Parameters
        ----------
        text
            The new comment text.

        """
        self.__arxiv_comment = text

    def primary_category(self: BaseEntryExtension, text: str) -> None:
        """
        Assign the primary_category value to this entry.

        Parameters
        ----------
        text
            The new primary_category name.

        """
        self.__arxiv_primary_category = text

    def journal_ref(self: BaseEntryExtension, text: str) -> None:
        """
        Assign the journal_ref value to this entry.

        Parameters
        ----------
        text
            The new journal_ref value.

        """
        self.__arxiv_journal_ref = text

    def doi(self: BaseEntryExtension, list: Dict[str, str]) -> None:
        """
        Assign the doi value to this entry.

        Parameters
        ----------
        list
            The DOI values for this entry. ``extend_atom`` iterates over
            this value and writes one element per item, so a single DOI
            given as a plain string is wrapped in a one-item list here;
            otherwise it would be written one character at a time.
            NOTE(review): the annotation says ``Dict[str, str]`` but the
            value is only ever iterated -- confirm the intended type. The
            parameter name shadows the builtin and is kept for callers.

        """
        self.__arxiv_doi = [list] if isinstance(list, str) else list
2 changes: 1 addition & 1 deletion search/routes/api/classic.py
Original file line number Diff line number Diff line change
@@ -28,7 +28,7 @@ def query() -> Response:
# requested = request.accept_mimetypes.best_match([JSON, ATOM_XML])
# if requested == ATOM_XML:
# return serialize.as_atom(data), status, headers
response_data = serialize.as_json(data['results'], query=data['query'])
response_data = serialize.as_atom(data['results'], query=data['query'])
return response_data, status_code, headers

@blueprint.route('<arxiv:paper_id>v<string:version>', methods=['GET'])
85 changes: 85 additions & 0 deletions search/routes/api/serialize.py
Original file line number Diff line number Diff line change
@@ -4,6 +4,11 @@
from xml.etree import ElementTree as etree
from flask import jsonify, url_for

from datetime import datetime
from feedgen.feed import FeedGenerator
from pytz import utc
from .atom_extensions import *

from arxiv import status
from search.domain import DocumentSet, Document, Classification, Person, \
APIQuery
@@ -183,6 +188,77 @@ class AtomXMLSerializer(BaseSerializer):
"opensearch": OPENSEARCH,
"arxiv": ARXIV
}

@classmethod
def transform_document(cls, fg: FeedGenerator, doc: Document,
                       query: Optional[APIQuery] = None) -> None:
    """
    Add one :class:`Document` to the feed as an Atom entry.

    A new entry is created on ``fg`` and populated with the document's id,
    title, abstract, dates, link, arXiv-specific fields, categories and
    authors. Nothing is returned; the feed generator is modified in place.

    Parameters
    ----------
    fg
        The feed generator that receives the new entry. The ``arxiv``
        extension must already be registered on it, because the entry is
        populated through ``entry.arxiv``.
    doc
        The document to describe.
    query
        Accepted for signature compatibility; not used in this method.

    """
    entry = fg.add_entry()

    # The entry id and its HTML link point at the same URL; build it once.
    paper_url = url_for("api.paper", paper_id=doc.paper_id,
                        version=doc.version, _external=True)
    entry.id(paper_url)
    entry.title(doc.title)
    entry.summary(doc.abstract)
    entry.published(doc.submitted_date)
    entry.updated(doc.updated_date)
    entry.link({'href': paper_url, "type": "text/html"})
    # TODO: PDF link is disabled in the original; kept here for reference.
    # entry.link({'href': url_for("api.pdf", paper_id=doc.paper_id, version=doc.version, _external=True),
    #             "type": "application/pdf", 'rel': 'related'})

    # Optional arXiv fields are written only when the document has them.
    if doc.comments:
        entry.arxiv.comment(doc.comments)

    if doc.journal_ref:
        entry.arxiv.journal_ref(doc.journal_ref)

    if doc.doi:
        entry.arxiv.doi(doc.doi)

    # Was 'http://arxiv.org://arxiv.org/schemas/atom', which is not a valid
    # URI; this matches the namespace used by the arXiv Atom extensions.
    category_scheme = 'http://arxiv.org/schemas/atom'

    # The primary category is written both as <arxiv:primary_category> and
    # as the first plain Atom <category>.
    primary_term = doc.primary_classification.archive['id']
    entry.arxiv.primary_category(primary_term)
    entry.category(term=primary_term, scheme=category_scheme)

    for category in doc.secondary_classification:
        entry.category(term=category.archive['id'], scheme=category_scheme)

    for author in doc.authors:
        entry.author({"name": author.full_name})


@classmethod
def serialize(cls, document_set: DocumentSet,
              query: Optional[APIQuery] = None) -> str:
    """
    Build the Atom feed for a :class:`DocumentSet`.

    Sets up a feed generator with the OpenSearch and arXiv extensions,
    fills in the feed-level fields and paging metadata, adds one entry per
    result via ``transform_document``, and returns the pretty-printed
    output of ``atom_str``.
    """
    feed = FeedGenerator()
    feed.register_extension('opensearch', OpenSearchExtension)
    feed.register_extension("arxiv", ArxivExtension, ArxivEntryExtension, rss=False)

    # Feed-level fields.
    feed.id("http://api.arxiv.org/")  # TODO: review API ID generation
    feed.title(f"arXiv Query: {query}")
    feed.link({"href" : "https://api.arxiv.org/", "type": 'application/atom+xml'})
    generated_at = datetime.utcnow().replace(tzinfo=utc)
    feed.updated(generated_at)

    # Paging information taken from the result-set metadata.
    metadata = document_set.metadata
    feed.opensearch.totalResults(metadata.get('total'))
    feed.opensearch.itemsPerPage(metadata.get('size'))
    feed.opensearch.startIndex(metadata.get('start'))

    for document in document_set.results:
        cls.transform_document(feed, document, query=query)

    serialized: str = feed.atom_str(pretty=True)
    return serialized

@classmethod
def serialize_document(cls, document: Document,
                       query: Optional[APIQuery] = None) -> str:
    """
    Generate an Atom response for a single :class:`Document`.

    The document is wrapped in a feed containing exactly one entry. The
    feed-level fields mirror those set by ``serialize``.

    The previous implementation called ``transform_document`` without the
    feed generator it requires (a ``TypeError``) and passed its ``None``
    return value to ``jsonify``.

    Parameters
    ----------
    document
        The document to serialize.
    query
        The originating query, if any; used in the feed title.

    Returns
    -------
    str
        The pretty-printed output of ``FeedGenerator.atom_str``.

    """
    fg = FeedGenerator()
    # transform_document populates entries through ``entry.arxiv``, so the
    # arXiv extension has to be registered before any entry is added.
    fg.register_extension("arxiv", ArxivExtension, ArxivEntryExtension, rss=False)
    fg.id("http://api.arxiv.org/")  # TODO: review API ID generation
    fg.title(f"arXiv Query: {query}")
    fg.link({"href": "https://api.arxiv.org/", "type": 'application/atom+xml'})
    fg.updated(datetime.utcnow().replace(tzinfo=utc))

    cls.transform_document(fg, document, query=query)

    serialized: str = fg.atom_str(pretty=True)
    return serialized
# fields = {
# 'title': '{%s}title' % ATOM,
# 'id': '{%s}id' % ATOM,
@@ -203,3 +279,12 @@ class AtomXMLSerializer(BaseSerializer):
#
# def __repr__(cls) -> str:
# return etree.tostring(cls._root, pretty_print=True)


def as_atom(document_or_set: Union[DocumentSet, Document],
            query: Optional[APIQuery] = None) -> str:
    """
    Serialize a :class:`DocumentSet` or a single :class:`Document` as Atom.

    An object whose type is exactly ``DocumentSet`` goes to
    ``AtomXMLSerializer.serialize``; anything else goes to
    ``AtomXMLSerializer.serialize_document``.
    """
    is_result_set = type(document_or_set) is DocumentSet
    if not is_result_set:
        return AtomXMLSerializer.serialize_document(document_or_set, query=query)  # type: ignore
    return AtomXMLSerializer.serialize(document_or_set, query=query)  # type: ignore