Skip to content

Commit

Permalink
Merge pull request #7 from jar2333/books
Browse files Browse the repository at this point in the history
Books route
  • Loading branch information
jar2333 authored May 2, 2024
2 parents 645923f + ba8fae4 commit a0e6a98
Show file tree
Hide file tree
Showing 11 changed files with 83 additions and 84 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -171,4 +171,7 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

.ruff_cache
.ruff_cache

env_variables.yaml
env.ps1
37 changes: 33 additions & 4 deletions app.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,39 @@
from flask import Flask
from flask import Flask, request, render_template, jsonify

Check failure on line 1 in app.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F401)

app.py:1:26: F401 `flask.request` imported but unused

Check failure on line 1 in app.py

View workflow job for this annotation

GitHub Actions / build

Ruff (F401)

app.py:1:52: F401 `flask.jsonify` imported but unused
from flask_caching import Cache

from env import GOODREADS_URL

from src.book_scraper import BookScraper

"""
Mounts a prefix for all routes
"""
SCRIPT_NAME = "/api"  # NOTE(review): not referenced in the visible code; confirm where the prefix is applied

# Scraper for the book API, pointed at the URL taken from the environment.
SCRAPER = BookScraper(GOODREADS_URL)

# App configuration, loaded into the Flask app below.
config = {
    "DEBUG": True,  # some Flask specific configs
    "CACHE_TYPE": "SimpleCache",  # Flask-Caching related configs
    "CACHE_DEFAULT_TIMEOUT": 300,
}

# The main application.
app = Flask(__name__)
app.config.from_mapping(config)

# Cache bound to the app; it picks up the CACHE_* settings loaded above.
cache = Cache(app)

@app.route("/")
def root():
return "Hello World!"
@app.route("/books")
@cache.cached(timeout=86400)
def get_books():
    """Render the scraped book list with the ``books.html`` template.

    The view is wrapped in ``cache.cached`` with a timeout of 86400, so
    the scraper is not invoked on every request.

    Returns:
        A ``(body, status)`` tuple: the rendered template and ``200``.
    """
    books = SCRAPER.scrape()
    page = render_template("books.html", results=books)
    return page, 200
6 changes: 3 additions & 3 deletions app.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@

runtime: python310

includes:
- env_variables.yaml

# Commented out because this is going to be the default service
# service: samoyedapi

# env_variables:
# PORT: "8000"

instance_class: F1

automatic_scaling:
Expand Down
3 changes: 3 additions & 0 deletions env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import os

# URL handed to the book scraper; None when the variable is not set.
GOODREADS_URL = os.getenv('GOODREADS_URL')
17 changes: 0 additions & 17 deletions main.py

This file was deleted.

39 changes: 28 additions & 11 deletions src/book_scraper.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
from cachetools import cached, TTLCache

class BookScraper:
"""
Scrapes the available book data at the given url.
"""

def __init__(self, url: str):
assert url is not None
self.url = url

@cached(cache=TTLCache(maxsize=1, ttl=86400))
def scrape(self) -> List[Dict[str, str]]:
html = self.request()
responses = self.parse(html)
Expand All @@ -16,7 +19,15 @@ def scrape(self) -> List[Dict[str, str]]:

def request(self) -> str:
try:
response = requests.get(self.url)
response = requests.get(self.url, headers={
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'Accept-Encoding': 'gzip, deflate, br, zstd',
'Accept-Language': 'en-US',
'Connection': 'keep-alive',
'Host': 'www.goodreads.com',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0',
})
response.raise_for_status()
except Exception as err:
raise err
Expand All @@ -31,17 +42,16 @@ def parse(self, html: str) -> List[Dict[str, str]]:

# Iterate through children
out = []
for child in root.children:
if isinstance(child, str) or child["class"] == "clear":
continue
for child in root.select("div.Updates"):
assert child.name == "div"

# Get first column tags
c1 = soup.select_one("div.firstcol")
c1 = child.select_one("div.firstcol")

image_tag = c1.img

# Get second column tags
c2 = soup.select_one("div.secondcol")
c2 = child.select_one("div.secondcol")

info_tag = c2.select_one("div.whos-review")

Expand All @@ -60,9 +70,16 @@ def parse(self, html: str) -> List[Dict[str, str]]:

image_url = image_tag["src"]

progress_percent = progress_tag.text.strip("()%")
assert progress_percent.isdigit()

# Extract progress percentage
progress_text = progress_tag.text.strip("()%")
if progress_text.startswith("page"):
split = progress_text.split()
current = int(split[1])
total = int(split[-1])
progress_percent = (100*current)//total
else:
progress_percent = int(progress_text)

# Output parsed data
out.append({
"title": title,
Expand Down
7 changes: 0 additions & 7 deletions src/books.py

This file was deleted.

Empty file removed src/courses.py
Empty file.
10 changes: 10 additions & 0 deletions templates/books.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
{#- Renders one "book" card per entry in `results`.
    Each entry is indexed by the keys: title, image_url, url, author_url,
    author and progress (progress is printed followed by a "%" sign). -#}
<div id="books">
{% for result in results %}
<div class="book">
<img alt="{{result['title']}}" src="{{result['image_url']}}">
<a href="{{result['url']}}"><p>{{result['title']}}</p></a>
<a href="{{result['author_url']}}"><p>By {{result['author']}}</p></a>
<p>Progress: {{result['progress']}}%</p>
</div>
{% endfor %}
</div>
2 changes: 0 additions & 2 deletions tests/sample_test.py

This file was deleted.

41 changes: 2 additions & 39 deletions tests/test_book_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def test_book_scraper_parser():
</div>
&nbsp;
<a onclick="if (typeof(clickPageOfBook) == 'function') {clickPageOfBook(594288, , {viewLink: '<a class=\&quot;greyText\&quot; href=\&quot;https://www.goodreads.com/user_status/show/813489194\&quot;>Mar 26, 2024 11:47PM<\/a>'});return false;};" class="greyText smallText" href="https://www.goodreads.com/user_status/show/813489194">(4%)</a>
<a onclick="if (typeof(clickPageOfBook) == 'function') {clickPageOfBook(594288, , {viewLink: '<a class=\&quot;greyText\&quot; href=\&quot;https://www.goodreads.com/user_status/show/13489194\&quot;>Mar 26, 2024 11:47PM<\/a>'});return false;};" class="greyText smallText" href="https://www.goodreads.com/user_status/show/13489194">(4%)</a>
<br class="clear">
<span class="readable">
Expand All @@ -172,43 +172,6 @@ def test_book_scraper_parser():
<span class="greyText">—</span>
<a class="greyText" href="https://www.goodreads.com/user_status/show/813489194">Mar 26, 2024 11:47PM</a>
</div>
<br class="clear">
<table>
<tbody><tr class="no_border feedFooterReview" id="update_comment_stuff_Review4917911731">
<td>&nbsp;</td>
<td colspan="2">
<div class="updateActionLinks">
<a class="updatedTimestamp" ref="timestamp" href="/review/show/4917911731">Sept 30, 2019 09:00AM</a>
&nbsp;·&nbsp;<a id="commentLink_4917911731" href="#" onclick="comment_form_for('review', 4917911731, true, ''); return false;">comment</a>
</div>
<div id="comments_for_review_4917911731" style="display: none;">
</div>
<div class="brown_comment" id="comments_form_review_4917911731" style="display: none">
<textarea class="placeholder_text" onclick="expand_comment_form_for('review', 4917911731, true, '')">Write a comment...</textarea>
</div>
</td>
</tr>
</tbody></table>
</div>
</div>
<div class="clear"></div></div>
"""

Expand All @@ -222,5 +185,5 @@ def test_book_scraper_parser():
"author": "Sylvia Plath",
"author_url": "http://goodreads.com/author/show/4379.Sylvia_Plath",
"image_url": "https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1554582218l/6514._SX98_.jpg",
"progress": "4"
"progress": 4
} == responses[0]

0 comments on commit a0e6a98

Please sign in to comment.