
Commit

pylint fixes, call paged_articles in all_articles
rahulbot committed Dec 6, 2023
1 parent 1e213c5 commit 9abc517
Showing 5 changed files with 35 additions and 25 deletions.
12 changes: 12 additions & 0 deletions .pylintrc
@@ -0,0 +1,12 @@
+[MASTER]
+disable=
+    C0114, # missing-module-docstring
+    C0115, # missing-class-docstring
+    C0116, # missing-function-docstring
+    C0209, # consider-using-f-string
+    R0913, # too-many-arguments
+
+[FORMAT]
+# Maximum number of characters on a single line.
+max-line-length=120
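
For comparison, any of these checks can also be silenced locally rather than repository-wide, using an inline pragma. A minimal sketch; the function below is hypothetical and not part of this codebase:

    # Hypothetical function: the trailing pragma scopes the R0913 suppression to
    # this one definition instead of disabling it globally via .pylintrc.
    def fetch_results(query, start_date, end_date, page_size, token, session):  # pylint: disable=too-many-arguments
        return {"query": query, "page_size": page_size}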

6 changes: 3 additions & 3 deletions setup.py
@@ -1,20 +1,20 @@
 #! /usr/bin/env python
-from setuptools import setup
 import re
 import os
+from setuptools import setup

 REQUIRED_PACKAGES = [
     # utilities
     "requests==2.*", # widely used HTTP library
     "ciso8601==2.2.*" # super-fast date parsing
 ]

-with open('waybacknews/__init__.py', 'r') as fd:
+with open('waybacknews/__init__.py', 'r', encoding='utf8') as fd:
     version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', fd.read(), re.MULTILINE).group(1)

 # add README.md to distribution
 this_directory = os.path.abspath(os.path.dirname(__file__))
-with open(os.path.join(this_directory, 'README.md')) as f:
+with open(os.path.join(this_directory, 'README.md'), 'r', encoding='utf8') as f:
     long_description = f.read()

 setup(name='wayback-news-search',
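As an aside, the version-extraction regex in setup.py can be exercised on its own. A quick sketch against a made-up __init__.py body (the version number is illustrative):

    import re

    sample = '__version__ = "1.2.3"\n'
    version = re.search(r'^__version__\s*=\s*[\'"]([^\'"]*)[\'"]', sample, re.MULTILINE).group(1)
    print(version)  # -> 1.2.3
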
32 changes: 15 additions & 17 deletions waybacknews/searchapi.py
@@ -1,9 +1,9 @@
 import datetime as dt
 from typing import List, Dict, Optional
-import requests
 import logging
+import requests
 import ciso8601
-import waybacknews.util as util
+from waybacknews import util

 VERSION = "v1" # the API access URL is versioned for future compatability and maintenance

@@ -89,7 +89,7 @@ def _date_query_clause(start_date: dt.datetime, end_date: dt.datetime) -> str:
     def _overview_query(self, query: str, start_date: dt.datetime, end_date: dt.datetime, **kwargs) -> Dict:
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)
-        results, response = self._query("{}/search/overview".format(self._collection), params, method='POST')
+        results, _ = self._query("{}/search/overview".format(self._collection), params, method='POST')
         return results

     def article(self, article_id: str) -> Dict:
@@ -104,17 +104,17 @@ def all_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetim
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)
         more_pages = True
+        next_page_token = None
         while more_pages:
-            page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
+            page, next_page_token = self.paged_articles(query, start_date, end_date, page_size, **kwargs,
+                                                        pagination_token=next_page_token)
             if self._is_no_results(page):
                 yield []
             else:
                 yield page
             # check if there is a link to the next page
             more_pages = False
-            next_link_token = response.headers.get('x-resume-token')
-            if next_link_token:
-                params['resume'] = next_link_token
+            if next_page_token:
                 more_pages = True

     def paged_articles(self, query: str, start_date: dt.datetime, end_date: dt.datetime,
@@ -126,18 +126,16 @@ def paged_articles(self, query: str, start_date: dt.datetime, end_date: dt.datet
         if pagination_token:
             params['resume'] = pagination_token
         params.update(kwargs)
-        more_pages = True
         page, response = self._query("{}/search/result".format(self._collection), params, method='POST')
         if self._is_no_results(page):
             return []
-        else:
-            return page, response.headers.get('x-resume-token')
+        return page, response.headers.get('x-resume-token')

-    def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str, **kwargs) -> Dict:
+    def terms(self, query: str, start_date: dt.datetime, end_date: dt.datetime, field: str, aggregation: str,
+              **kwargs) -> Dict:
         params = {"q": "{} AND {}".format(query, self._date_query_clause(start_date, end_date))}
         params.update(kwargs)
-        results, response = self._query("{}/terms/{}/{}".format(self._collection, field, aggregation), params,
-                                        method='GET')
+        results, _ = self._query("{}/terms/{}/{}".format(self._collection, field, aggregation), params, method='GET')
         return results

     def _query(self, endpoint: str, params: Dict = None, method: str = 'GET'):
@@ -155,9 +153,9 @@ def _query(self, endpoint: str, params: Dict = None, method: str = 'GET'):
             r = self._session.post(endpoint_url, json=params, timeout=self.TIMEOUT_SECS)
         else:
             raise RuntimeError("Unsupported method of '{}'".format(method))

         if r.status_code >= 500:
-            raise RuntimeError("API Server Error {}: a bad query string could have triggered this. Endpoint: {}, Params: {}".
-                               format(r.status_code, endpoint_url, params))
+            raise RuntimeError("API Server Error {}: a bad query string could have triggered this. Endpoint: {},"
+                               " Params: {}".format(r.status_code, endpoint_url, params))

         return r.json(), r
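
The net effect of the searchapi.py change is that all_articles is now a thin generator loop over paged_articles, threading the x-resume-token pagination token between requests instead of issuing raw /search/result calls itself. A minimal usage sketch, assuming the package is installed and the API endpoint is reachable; the collection name, query, and dates below are illustrative and not taken from this commit:

    import datetime as dt
    from waybacknews.searchapi import SearchApiClient

    api = SearchApiClient("mediacloud")  # illustrative collection name
    start = dt.datetime(2023, 11, 1)
    end = dt.datetime(2023, 11, 7)

    # all_articles yields one page of article dicts per iteration, calling
    # paged_articles under the hood until the API stops returning a resume token.
    for page in api.all_articles('"climate change"', start, end):
        for article in page:
            print(article.get('article_url'))

    # paged_articles can also be driven by hand: it returns the page plus the
    # resume token to pass back in as pagination_token on the next call (with no
    # matching results it returns an empty list instead).
    first_page, token = api.paged_articles('"climate change"', start, end)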
2 changes: 1 addition & 1 deletion waybacknews/tests/test_util.py
@@ -11,7 +11,7 @@ def test_sanitize_query(self):
         assert sanitized == "url:*dailyvoice.com\/new-york\/mountpleasant*"

     def test_dict_to_list(self):
-        api_like_data = dict(key1='value1', key2='value2')
+        api_like_data = { 'key1': 'value1', 'key2':'value2' }
         list_version = util.dict_to_list(api_like_data)
         assert len(list_version) == 2
         assert list_version[0]['name'] == 'key1'
8 changes: 4 additions & 4 deletions waybacknews/tests/test_waybacknews.py
@@ -79,8 +79,8 @@ def test_sample(self):
             assert 'publication_date' in r

     def test_article(self):
-        STORY_ID = "ZDY3YzdlNWE3YTJkMDZiYTcwNjJhNTZiZjY5YzczMTY~'}"
-        story = self._api.article(STORY_ID)
+        story_id = "ZDY3YzdlNWE3YTJkMDZiYTcwNjJhNTZiZjY5YzczMTY~'}"
+        story = self._api.article(story_id)
         assert len(story['title']) > 0
         assert story['language'] == 'en'
         assert story['domain'] == 'dailyvoice.com'
@@ -152,7 +152,7 @@ def test_top_terms(self):
                                   field=SearchApiClient.TERM_FIELD_SNIPPET,
                                   aggregation=SearchApiClient.TERM_AGGREGATION_TOP)
         last_count = 99999999999
-        for term, count in results.items():
+        for _, count in results.items():
             assert last_count >= count
             last_count = count

@@ -163,7 +163,7 @@ def test_content_via_article_url(self):
         end_date = dt.datetime(2022, 3, 4)
         for page in self._api.all_articles(query, start_date, end_date):
             for article in page[:5]:
-                article_info = requests.get(article['article_url']).json()
+                article_info = requests.get(article['article_url'], timeout=30).json()
                 assert 'snippet' in article_info
                 assert len(article_info['snippet']) > 0
             break
