
Commit

pylint fixes, call paged_articles in all_articles
rahulbot committed Dec 6, 2023
1 parent 1e213c5 commit 9abc517
Showing 5 changed files with 35 additions and 25 deletions.
12 changes: 12 additions & 0 deletions .pylintrc
@@ -0,0 +1,12 @@
+[MASTER]
+disable=
+    C0114, # missing-module-docstring
+    C0115, # missing-class-docstring
+    C0116, # missing-function-docstring
+    C0209, # consider-using-f-string
+    R0913, # too-many-arguments
+
+[FORMAT]
+# Maximum number of characters on a single line.
+max-line-length=120
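
For comparison, any of these checks can also be silenced locally rather than repository-wide, using an inline pragma. A minimal sketch; the function below is hypothetical and not part of this codebase:

    # Hypothetical function: the trailing pragma scopes the R0913 suppression to
    # this one definition instead of disabling it globally via .pylintrc.
    def fetch_results(query, start_date, end_date, page_size, token, session):  # pylint: disable=too-many-arguments
        return {"query": query, "page_size": page_size}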

6 changes: 3 additions & 3 deletions setup.py
@@ -1,20 +1,20 @@
 #! /usr/bin/env python
-from setuptools import setup
 import re
 import os
+from setuptools import setup

 REQUIRED_PACKAGES = [
     # utilities
     "requests==2.*", # widely used HTTP library
     "ciso8601==2.2.*" # super-fast date parsing
 ]

-with open('waybacknews/__init__.py', 'r') as fd:
+with open('waybacknews/__init__.py', 'r', encoding='utf8') as fd:
     version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1)

 # add README.md to distribution
 this_directory = os.path.abspath(os.path.dirname(__file__))
-with open(os.path.join(this_directory, 'README.md')) as f:
+with open(os.path.join(this_directory, 'README.md'), 'r', encoding='utf8') as f:
     long_description = f.read()

 setup(name='wayback-news-search',
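As an aside, the version-extraction regex in setup.py can be exercised on its own. A quick sketch against a made-up __init__.py body (the version number is illustrative):

    import re

    sample = '__version__ = "1.2.3"\n'
    version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', sample, re.MULTILINE).group(1)
    print(version)  # -> 1.2.3
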
32 changes: 15 additions & 17 deletions waybacknews/searchapi.py
@@ -1,9 +1,9 @@
 import datetime as dt
 from typing import List, Dict, Optional
-import requests
 import logging
+import requests
 import ciso8601
-import waybacknews.util as util
+from waybacknews import util

 VERSION = "v1" # the API access URL is versioned for future compatability and maintenance

@@ -89,7 +89,7 @@ def _date_query_clause(start_date: dt.datetime, end_date: dt.datetime) -> str:
     def _overview_query(self, query: str, start_date: dt.datetime, end_date: dt.datetime, **kwargs) -> Dict:
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)
-        results, response = self._query("{}/search/overview".format(self._collection), params, method='POST')
+        results, _ = self._query("{}/search/overview".format(self._collection), params, method='POST')
         return results

     def article(self, article_id: str) -> Dict:
@@ -104,17 +104,17 @@ def all_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetim
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)
         more_pages = True
+        next_page_token = None
         while more_pages:
-            page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
+            page, next_page_token = self.paged_articles(query, start_date, end_date, page_size, **kwargs,
+                                                        pagination_token=next_page_token)
             if self._is_no_results(page):
                 yield []
             else:
                 yield page
             # check if there is a link to the next page
             more_pages = False
-            next_link_token = response.headers.get('x-resume-token')
-            if next_link_token:
-                params['resume'] = next_link_token
+            if next_page_token:
                 more_pages = True

     def paged_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime,
@@ -126,18 +126,16 @@ def paged_articles(self, query: str, start_date: dt.datetime, end_date: dt.datet
         if pagination_token:
             params['resume'] = pagination_token
         params.update(kwargs)
-        more_pages = True
         page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
         if self._is_no_results(page):
             return []
-        else:
-            return page, response.headers.get('x-resume-token')
+        return page, response.headers.get('x-resume-token')

-    def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str, **kwargs) -> Dict:
+    def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str,
+              **kwargs) -> Dict:
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)
-        results, response = self._query("{}/terms/{}/{}".format(self._collection, field, aggregation), params,
-                                        method='GET')
+        results, _ = self._query("{}/terms/{}/{}".format(self._collection, field, aggregation), params, method='GET')
         return results

     def _query(self, endpoint: str, params: Dict = None, method: str = 'GET'):
@@ -155,9 +153,9 @@ def _query(self, endpoint: str, params: Dict = None, method: str = 'GET'):
             r = self._session.post(endpoint_url, json=params, timeout=self.TIMEOUT_SECS)
         else:
             raise RuntimeError("Unsupported method of '{}'".format(method))

         if r.status_code >= 500:
-            raise RuntimeError("API Server Error {}: a bad query string could have triggered this. Endpoint: {}, Params: {}".
-                               format(r.status_code, endpoint_url, params))
+            raise RuntimeError("API Server Error {}: a bad query string could have triggered this. Endpoint: {},"
+                               " Params: {}".format(r.status_code, endpoint_url, params))

         return r.json(), r
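
The net effect of the searchapi.py change is that all_articles is now a thin generator loop over paged_articles, threading the x-resume-token pagination token between requests instead of issuing raw /search/result calls itself. A minimal usage sketch, assuming the package is installed and the API endpoint is reachable; the collection name, query, and dates below are illustrative and not taken from this commit:

    import datetime as dt
    from waybacknews.searchapi import SearchApiClient

    api = SearchApiClient("mediacloud")  # illustrative collection name
    start = dt.datetime(2023, 11, 1)
    end = dt.datetime(2023, 11, 7)

    # all_articles yields one page of article dicts per iteration, calling
    # paged_articles under the hood until the API stops returning a resume token.
    for page in api.all_articles('"climate change"', start, end):
        for article in page:
            print(article.get('article_url'))

    # paged_articles can also be driven by hand: it returns the page plus the
    # resume token to pass back in as pagination_token on the next call (with no
    # matching results it returns an empty list instead).
    first_page, token = api.paged_articles('"climate change"', start, end)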
2 changes: 1 addition & 1 deletion waybacknews/tests/test_util.py
@@ -11,7 +11,7 @@ def test_sanitize_query(self):
         assert sanitized == "url:*dailyvoice.com\/new-york\/mountpleasant*"

     def test_dict_to_list(self):
-        api_like_data = dict(key1='value1', key2='value2')
+        api_like_data = { 'key1': 'value1', 'key2':'value2' }
         list_version = util.dict_to_list(api_like_data)
         assert len(list_version) == 2
         assert list_version[0]['name'] == 'key1'
8 changes: 4 additions & 4 deletions waybacknews/tests/test_waybacknews.py
@@ -79,8 +79,8 @@ def test_sample(self):
             assert 'publication_date' in r

     def test_article(self):
-        STORY_ID = "ZDY3YzdlNWE3YTJkMDZiYTcwNjJhNTZiZjY5YzczMTY~'}"
-        story = self._api.article(STORY_ID)
+        story_id = "ZDY3YzdlNWE3YTJkMDZiYTcwNjJhNTZiZjY5YzczMTY~'}"
+        story = self._api.article(story_id)
         assert len(story['title']) > 0
         assert story['language'] == 'en'
         assert story['domain'] == 'dailyvoice.com'
@@ -152,7 +152,7 @@ def test_top_terms(self):
                                   field=SearchApiClient.TERM_FIELD_SNIPPET,
                                   aggregation=SearchApiClient.TERM_AGGREGATION_TOP)
         last_count = 99999999999
-        for term, count in results.items():
+        for _, count in results.items():
             assert last_count >= count
             last_count = count

@@ -163,7 +163,7 @@ def test_content_via_article_url(self):
         end_date = dt.datetime(2022, 3, 4)
         for page in self._api.all_articles(query, start_date, end_date):
             for article in page[:5]:
-                article_info = requests.get(article['article_url']).json()
+                article_info = requests.get(article['article_url'], timeout=30).json()
                 assert 'snippet' in article_info
                 assert len(article_info['snippet']) > 0
             break
