Atom serializer

arXiv · JaimieMurdock · Jun 11, 2019 · Feb 25, 2019 · Mar 6, 2019 · Mar 7, 2019
commit 6ee11de002d72287d83def85ad1467f5b8a0d898
diff --git a/Pipfile b/Pipfile
@@ -17,6 +17,7 @@ dataclasses = "==0.4"
 docutils = "==0.14"
 elasticsearch = "==6.2.0"
 elasticsearch-dsl = "==6.3.1"
+feedgen = "==0.7.0"
 flask = "==1.0.2"
 "flask-s3" = "==0.3.3"
 idna = "==2.6"

diff --git a/Pipfile.lock b/Pipfile.lock
diff --git a/search/controllers/api/__init__.py b/search/controllers/api/__init__.py
@@ -145,6 +145,15 @@ def classic_query(params: MultiDict) -> Tuple[Dict[str, Any], int, Dict[str, Any
         params['query'] = raw_query
         del params['search_query']
 
+        params.add('include', 'abstract')
+        params.add('include', 'submitted_date')
+        params.add('include', 'updated_date')
+        params.add('include', 'comments')
+        params.add('include', 'journal_ref')
+        params.add('include', 'doi')
+        params.add('include', 'primary_classification')
+        params.add('include', 'secondary_classification')
+        params.add('include', 'authors')
         # pass to normal search, which will handle parsing
         data, _, _ = search(params) # type: ignore
 

diff --git a/search/routes/api/atom_extensions.py b/search/routes/api/atom_extensions.py
@@ -0,0 +1,259 @@
+"""Classes derived from the Feedgen extension classes."""
+
+from typing import Dict, Iterable, Optional, Union
+
+from feedgen.ext.base import BaseEntryExtension, BaseExtension
+from feedgen.entry import FeedEntry
+from feedgen.feed import FeedGenerator
+from lxml import etree
+
+
+class OpenSearchExtension(BaseExtension):
+    """
+    Feedgen feed extension that writes OpenSearch paging elements.
+
+    Values are set through :meth:`totalResults`, :meth:`startIndex` and
+    :meth:`itemsPerPage`. Every value that has been set is added to the Atom
+    feed as a child element in the OpenSearch 1.1 namespace; values that were
+    never set (or were set to ``None``) are omitted.
+
+    """
+
+    _NAMESPACE = 'http://a9.com/-/spec/opensearch/1.1/'
+
+    def __init__(self) -> None:
+        """Initialize all OpenSearch values as unset."""
+        self.__opensearch_totalResults: Optional[str] = None
+        self.__opensearch_startIndex: Optional[str] = None
+        self.__opensearch_itemsPerPage: Optional[str] = None
+
+    def extend_atom(self, atom_feed: 'etree._Element') -> 'etree._Element':
+        """
+        Add the OpenSearch elements that have been set to the Atom feed.
+
+        Parameters
+        ----------
+        atom_feed
+            The feed element to extend. It is used as the parent of the new
+            sub-elements, so it must be an lxml element.
+
+        Returns
+        -------
+        etree._Element
+            The same feed element, after modification.
+
+        """
+        # Emission order is kept as itemsPerPage, totalResults, startIndex.
+        opensearch_values = (
+            ('itemsPerPage', self.__opensearch_itemsPerPage),
+            ('totalResults', self.__opensearch_totalResults),
+            ('startIndex', self.__opensearch_startIndex),
+        )
+        for tag, value in opensearch_values:
+            if value is not None:
+                element = etree.SubElement(
+                    atom_feed, '{%s}%s' % (self._NAMESPACE, tag))
+                element.text = value
+
+        return atom_feed
+
+    @staticmethod
+    def extend_rss(rss_feed: FeedGenerator) -> FeedGenerator:
+        """
+        Leave the RSS feed untouched; this extension only affects Atom.
+
+        Parameters
+        ----------
+        rss_feed
+            The RSS feed object handed to the extension.
+
+        Returns
+        -------
+        FeedGenerator
+            The provided object, unchanged.
+
+        """
+        return rss_feed
+
+    @staticmethod
+    def extend_ns() -> Dict[str, str]:
+        """
+        Return the namespace mapping contributed by this extension.
+
+        Returns
+        -------
+        dict
+            Maps the ``opensearch`` prefix to the OpenSearch 1.1 namespace
+            URI.
+
+        """
+        return {'opensearch': OpenSearchExtension._NAMESPACE}
+
+    @staticmethod
+    def _as_text(value: object) -> Optional[str]:
+        """Convert a value to element text, keeping ``None`` as "unset"."""
+        # str(None) would be the literal 'None', which would then be written
+        # into the feed; preserve None so extend_atom skips the element.
+        return None if value is None else str(value)
+
+    def totalResults(self, text: object) -> None:
+        """Set the totalResults value; ``None`` leaves the element out."""
+        self.__opensearch_totalResults = self._as_text(text)
+
+    def startIndex(self, text: object) -> None:
+        """Set the startIndex value; ``None`` leaves the element out."""
+        self.__opensearch_startIndex = self._as_text(text)
+
+    def itemsPerPage(self, text: object) -> None:
+        """Set the itemsPerPage value; ``None`` leaves the element out."""
+        self.__opensearch_itemsPerPage = self._as_text(text)
+
+
+class ArxivExtension(BaseExtension):
+    """Feed-level Feedgen extension that supplies the ``arxiv`` namespace."""
+
+    def __init__(self: BaseExtension) -> None:
+        """Do nothing; this extension keeps no state."""
+
+    @staticmethod
+    def extend_atom(atom_feed: FeedGenerator) -> FeedGenerator:
+        """
+        Return the Atom feed as-is; no feed-level elements are added.
+
+        Parameters
+        ----------
+        atom_feed
+            The Atom feed object handed to the extension.
+
+        Returns
+        -------
+        FeedGenerator
+            The very same object, unmodified.
+
+        """
+        return atom_feed
+
+    @staticmethod
+    def extend_rss(rss_feed: FeedGenerator) -> FeedGenerator:
+        """
+        Return the RSS feed as-is; no feed-level elements are added.
+
+        Parameters
+        ----------
+        rss_feed
+            The RSS feed object handed to the extension.
+
+        Returns
+        -------
+        FeedGenerator
+            The very same object, unmodified.
+
+        """
+        return rss_feed
+
+    @staticmethod
+    def extend_ns() -> Dict[str, str]:
+        """
+        Return the namespace mapping contributed by this extension.
+
+        Returns
+        -------
+        dict
+            Maps the ``arxiv`` prefix to the arXiv Atom schema URI.
+
+        """
+        namespaces = {'arxiv': 'http://arxiv.org/schemas/atom'}
+        return namespaces
+
+
+class ArxivEntryExtension(BaseEntryExtension):
+    """
+    Feedgen entry extension that adds arXiv-specific elements to Atom entries.
+
+    The comment, primary category, journal reference and DOI values are set
+    through the methods of the same name and written by :meth:`extend_atom`
+    as elements in the ``http://arxiv.org/schemas/atom`` namespace.
+
+    """
+
+    _NAMESPACE = 'http://arxiv.org/schemas/atom'
+
+    def __init__(self) -> None:
+        """Initialize the member values to all be empty."""
+        self.__arxiv_comment = None
+        self.__arxiv_primary_category = None
+        self.__arxiv_doi = None
+        # NOTE: affiliation has no setter and is not emitted by extend_atom.
+        self.__arxiv_affiliation = None
+        self.__arxiv_journal_ref = None
+
+    def _sub_element(self, entry: 'etree._Element',
+                     tag: str) -> 'etree._Element':
+        """Create and return a child of ``entry`` in the arXiv namespace."""
+        return etree.SubElement(entry, '{%s}%s' % (self._NAMESPACE, tag))
+
+    def extend_atom(self, entry: 'etree._Element') -> 'etree._Element':
+        """
+        Add this extension's new elements to the Atom feed entry.
+
+        Only values that are set and non-empty produce an element.
+
+        Parameters
+        ----------
+        entry
+            The entry element to modify. It is used as the parent of the new
+            sub-elements, so it must be an lxml element.
+
+        Returns
+        -------
+        etree._Element
+            The modified entry.
+
+        """
+        if self.__arxiv_comment:
+            self._sub_element(entry, 'comment').text = self.__arxiv_comment
+
+        if self.__arxiv_primary_category:
+            # The category is carried in a ``term`` attribute, not as text.
+            primary_category_element = \
+                self._sub_element(entry, 'primary_category')
+            primary_category_element.attrib['term'] = \
+                self.__arxiv_primary_category
+
+        if self.__arxiv_journal_ref:
+            self._sub_element(entry, 'journal_ref').text = \
+                self.__arxiv_journal_ref
+
+        if self.__arxiv_doi:
+            # One <arxiv:doi> element per DOI.
+            for doi in self.__arxiv_doi:
+                self._sub_element(entry, 'doi').text = doi
+
+        return entry
+
+    @staticmethod
+    def extend_rss(entry: FeedEntry) -> FeedEntry:
+        """
+        Leave the RSS entry untouched; this extension only affects Atom.
+
+        Parameters
+        ----------
+        entry
+            The RSS entry handed to the extension.
+
+        Returns
+        -------
+        FeedEntry
+            The provided entry, unchanged.
+
+        """
+        return entry
+
+    def comment(self, text: str) -> None:
+        """
+        Assign the comment value to this entry.
+
+        Parameters
+        ----------
+        text
+            The new comment text.
+
+        """
+        self.__arxiv_comment = text
+
+    def primary_category(self, text: str) -> None:
+        """
+        Assign the primary_category value to this entry.
+
+        Parameters
+        ----------
+        text
+            The new primary_category name.
+
+        """
+        self.__arxiv_primary_category = text
+
+    def journal_ref(self, text: str) -> None:
+        """
+        Assign the journal_ref value to this entry.
+
+        Parameters
+        ----------
+        text
+            The new journal_ref value.
+
+        """
+        self.__arxiv_journal_ref = text
+
+    def doi(self, list: Union[str, Iterable[str]]) -> None:
+        """
+        Assign the doi value(s) to this entry.
+
+        Parameters
+        ----------
+        list
+            Either a single DOI string or an iterable of DOI strings. The
+            parameter name shadows the builtin but is kept so that existing
+            keyword callers continue to work.
+
+        """
+        if isinstance(list, str):
+            # A bare string is one DOI; iterating it directly would emit one
+            # <arxiv:doi> element per character.
+            self.__arxiv_doi = [list]
+        else:
+            self.__arxiv_doi = list
diff --git a/search/routes/api/classic.py b/search/routes/api/classic.py
@@ -28,7 +28,7 @@ def query() -> Response:
     # requested = request.accept_mimetypes.best_match([JSON, ATOM_XML])
     # if requested == ATOM_XML:
     #     return serialize.as_atom(data), status, headers
-    response_data = serialize.as_json(data['results'], query=data['query'])
+    response_data = serialize.as_atom(data['results'], query=data['query'])
     return response_data, status_code, headers
 
 @blueprint.route('<arxiv:paper_id>v<string:version>', methods=['GET'])

diff --git a/search/routes/api/serialize.py b/search/routes/api/serialize.py
@@ -4,6 +4,11 @@
 from xml.etree import ElementTree as etree
 from flask import jsonify, url_for
 
+from datetime import datetime
+from feedgen.feed import FeedGenerator
+from pytz import utc
+from .atom_extensions import *
+
 from arxiv import status
 from search.domain import DocumentSet, Document, Classification, Person, \
     APIQuery
@@ -183,6 +188,77 @@ class AtomXMLSerializer(BaseSerializer):
         "opensearch": OPENSEARCH,
         "arxiv": ARXIV
     }
+
+    @classmethod
+    def transform_document(cls, fg: FeedGenerator, doc: Document,
+                           query: Optional[APIQuery] = None) -> None:
+        """
+        Append an Atom entry describing ``doc`` to the feed ``fg``.
+
+        Parameters
+        ----------
+        fg
+            The feed generator that receives the new entry. Entries are
+            expected to expose the ``arxiv`` entry extension.
+        doc
+            The document whose metadata populates the entry.
+        query
+            Currently unused; accepted for signature parity with the other
+            serializer methods.
+
+        """
+        # The same URL serves as both the entry ID and its HTML link.
+        paper_url = url_for("api.paper", paper_id=doc.paper_id,
+                            version=doc.version, _external=True)
+        # Matches the URI of the ``arxiv`` XML namespace.
+        category_scheme = 'http://arxiv.org/schemas/atom'
+
+        entry = fg.add_entry()
+        entry.id(paper_url)
+        entry.title(doc.title)
+        entry.summary(doc.abstract)
+        entry.published(doc.submitted_date)
+        entry.updated(doc.updated_date)
+        entry.link({'href': paper_url, "type": "text/html"})
+        # TODO: add the PDF link once an "api.pdf" route is available:
+        #entry.link({'href': url_for("api.pdf", paper_id=doc.paper_id, version=doc.version, _external=True),
+        #            "type": "application/pdf", 'rel': 'related'})
+
+        if doc.comments:
+            entry.arxiv.comment(doc.comments)
+
+        if doc.journal_ref:
+            entry.arxiv.journal_ref(doc.journal_ref)
+
+        if doc.doi:
+            entry.arxiv.doi(doc.doi)
+
+        primary_term = doc.primary_classification.archive['id']
+        entry.arxiv.primary_category(primary_term)
+        entry.category(term=primary_term, scheme=category_scheme)
+
+        # Tolerate documents where these collections are None.
+        for category in doc.secondary_classification or ():
+            entry.category(term=category.archive['id'],
+                           scheme=category_scheme)
+
+        for author in doc.authors or ():
+            entry.author({"name": author.full_name})
+
+
+    @classmethod
+    def serialize(cls, document_set: DocumentSet,
+                  query: Optional[APIQuery] = None) -> str:
+        """
+        Render a :class:`DocumentSet` as an Atom feed.
+
+        The feed carries OpenSearch paging values taken from the set's
+        metadata (``total``, ``size``, ``start``) and one entry per result.
+
+        """
+        feed = FeedGenerator()
+        feed.register_extension('opensearch', OpenSearchExtension)
+        feed.register_extension("arxiv", ArxivExtension, ArxivEntryExtension,
+                                rss=False)
+
+        # Feed-level metadata.
+        feed.id("http://api.arxiv.org/")  # TODO: review API ID generation
+        feed.title(f"arXiv Query: {query}")
+        feed.link({"href": "https://api.arxiv.org/",
+                   "type": 'application/atom+xml'})
+        feed.updated(datetime.utcnow().replace(tzinfo=utc))
+
+        # Paging information from the search metadata.
+        paging = document_set.metadata
+        feed.opensearch.totalResults(paging.get('total'))
+        feed.opensearch.itemsPerPage(paging.get('size'))
+        feed.opensearch.startIndex(paging.get('start'))
+
+        for document in document_set.results:
+            cls.transform_document(feed, document, query=query)
+
+        atom_xml: str = feed.atom_str(pretty=True)
+        return atom_xml
+
+    @classmethod
+    def serialize_document(cls, document: Document,
+                           query: Optional[APIQuery] = None) -> str:
+        """
+        Generate an Atom feed containing a single :class:`Document`.
+
+        ``transform_document`` needs a feed to append its entry to, so a
+        one-entry feed is built here with the same feed-level metadata that
+        ``serialize`` uses.
+
+        """
+        fg = FeedGenerator()
+        fg.register_extension("arxiv", ArxivExtension, ArxivEntryExtension,
+                              rss=False)
+        fg.id("http://api.arxiv.org/")  # TODO: review API ID generation
+        fg.title(f"arXiv Query: {query}")
+        fg.link({"href": "https://api.arxiv.org/",
+                 "type": 'application/atom+xml'})
+        fg.updated(datetime.utcnow().replace(tzinfo=utc))
+
+        cls.transform_document(fg, document, query=query)
+
+        serialized: str = fg.atom_str(pretty=True)
+        return serialized
 #     fields = {
 #         'title': '{%s}title' % ATOM,
 #         'id': '{%s}id' % ATOM,
@@ -203,3 +279,12 @@ class AtomXMLSerializer(BaseSerializer):
 #
 #     def __repr__(cls) -> str:
 #         return etree.tostring(cls._root, pretty_print=True)
+
+
+def as_atom(document_or_set: Union[DocumentSet, Document],
+            query: Optional[APIQuery] = None) -> str:
+    """
+    Serialize a :class:`DocumentSet` or a single :class:`Document` as Atom.
+
+    Dispatches on the exact type of ``document_or_set``: a
+    :class:`DocumentSet` goes to ``AtomXMLSerializer.serialize``, anything
+    else to ``AtomXMLSerializer.serialize_document``.
+    """
+    if type(document_or_set) is not DocumentSet:
+        return AtomXMLSerializer.serialize_document(document_or_set, query=query)  # type: ignore
+    return AtomXMLSerializer.serialize(document_or_set, query=query)  # type: ignore
+