From 9034e89d40b7f73eafb9bb865a54d8920edeaa2b Mon Sep 17 00:00:00 2001
From: neonet1 <38730172+neonet1@users.noreply.github.com>
Date: Mon, 1 Apr 2024 20:57:25 -0400
Subject: [PATCH 01/11] added books endpoint (pending confidential url
mechanism)
---
app.py | 28 ++++++++++++++++++++++++----
src/book_scraper.py | 22 ++++++++++++++--------
src/books.py | 7 -------
src/courses.py | 0
templates/books.html | 10 ++++++++++
tests/sample_test.py | 2 --
6 files changed, 48 insertions(+), 21 deletions(-)
delete mode 100644 src/books.py
delete mode 100644 src/courses.py
create mode 100644 templates/books.html
delete mode 100644 tests/sample_test.py
diff --git a/app.py b/app.py
index fba1d84..7d5f45e 100644
--- a/app.py
+++ b/app.py
@@ -1,10 +1,30 @@
-from flask import Flask
+from flask import Flask, request, render_template, jsonify
+# from flask_caching import Cache
+
+from src.book_scraper import BookScraper
+
+"""
+Mounts a prefix for all routes
+"""
+SCRIPT_NAME = "/api"
+
+"""
+Scraper for book API
+"""
+SCRAPER = BookScraper("")
"""
The main application
"""
app = Flask(__name__)
-@app.route("/")
-def root():
- return "Hello World!"
+@app.route("/books")
+def get_books():
+ results = SCRAPER.scrape()
+
+ accepts = request.headers.get("Accept")
+ match accepts:
+ case "text/html":
+ return render_template("books.html", results)
+ case _:
+ return jsonify(results)
\ No newline at end of file
diff --git a/src/book_scraper.py b/src/book_scraper.py
index 7059bcd..aaffc78 100644
--- a/src/book_scraper.py
+++ b/src/book_scraper.py
@@ -31,17 +31,16 @@ def parse(self, html: str) -> List[Dict[str, str]]:
# Iterate through children
out = []
- for child in root.children:
- if isinstance(child, str) or child["class"] == "clear":
- continue
+ for child in root.select("div.Updates"):
+ assert child.name == "div"
# Get first column tags
- c1 = soup.select_one("div.firstcol")
+ c1 = child.select_one("div.firstcol")
image_tag = c1.img
# Get second column tags
- c2 = soup.select_one("div.secondcol")
+ c2 = child.select_one("div.secondcol")
info_tag = c2.select_one("div.whos-review")
@@ -60,9 +59,16 @@ def parse(self, html: str) -> List[Dict[str, str]]:
image_url = image_tag["src"]
- progress_percent = progress_tag.text.strip("()%")
- assert progress_percent.isdigit()
-
+ # Extract progress percentage
+ progress_text = progress_tag.text.strip("()%")
+ if progress_text.startswith("page"):
+ split = progress_text.split()
+ current = int(split[1])
+ total = int(split[-1])
+ progress_percent = (100*current)//total
+ else:
+ progress_percent = int(progress_text)
+
# Output parsed data
out.append({
"title": title,
diff --git a/src/books.py b/src/books.py
deleted file mode 100644
index 9e5f5f8..0000000
--- a/src/books.py
+++ /dev/null
@@ -1,7 +0,0 @@
-
-
-
-
-
-
-
diff --git a/src/courses.py b/src/courses.py
deleted file mode 100644
index e69de29..0000000
diff --git a/templates/books.html b/templates/books.html
new file mode 100644
index 0000000..9c58587
--- /dev/null
+++ b/templates/books.html
@@ -0,0 +1,10 @@
+
+ {% for result in results %}
+
+ {% endfor %}
+
diff --git a/tests/sample_test.py b/tests/sample_test.py
deleted file mode 100644
index a6f1c78..0000000
--- a/tests/sample_test.py
+++ /dev/null
@@ -1,2 +0,0 @@
-def test_sample():
- assert True
\ No newline at end of file
From d291772a5ad41b5e837bcc6dedaae182f357051d Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 19:18:11 -0400
Subject: [PATCH 02/11] Fixed caching, simplified app
---
.gitignore | 4 +++-
app.py | 23 +++++++++++++++--------
src/book_scraper.py | 2 --
3 files changed, 18 insertions(+), 11 deletions(-)
diff --git a/.gitignore b/.gitignore
index 0b1f484..0ea76dd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -171,4 +171,6 @@ cython_debug/
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
-.ruff_cache
\ No newline at end of file
+.ruff_cache
+
+env_variables.yaml
\ No newline at end of file
diff --git a/app.py b/app.py
index 7d5f45e..246afc3 100644
--- a/app.py
+++ b/app.py
@@ -1,5 +1,5 @@
from flask import Flask, request, render_template, jsonify
-# from flask_caching import Cache
+from flask_caching import Cache
from src.book_scraper import BookScraper
@@ -13,18 +13,25 @@
"""
SCRAPER = BookScraper("")
+"""
+App configuration
+"""
+config = {
+ "DEBUG": True, # some Flask specific configs
+ "CACHE_TYPE": "SimpleCache", # Flask-Caching related configs
+ "CACHE_DEFAULT_TIMEOUT": 300
+}
+
"""
The main application
"""
app = Flask(__name__)
+app.config.from_mapping(config)
+
+cache = Cache(app)
@app.route("/books")
+@cache.cached(timeout=86400)
def get_books():
results = SCRAPER.scrape()
-
- accepts = request.headers.get("Accept")
- match accepts:
- case "text/html":
- return render_template("books.html", results)
- case _:
- return jsonify(results)
\ No newline at end of file
+ return render_template("books.html", results), 200
\ No newline at end of file
diff --git a/src/book_scraper.py b/src/book_scraper.py
index aaffc78..66a0208 100644
--- a/src/book_scraper.py
+++ b/src/book_scraper.py
@@ -1,13 +1,11 @@
from typing import List, Dict
import requests
from bs4 import BeautifulSoup
-from cachetools import cached, TTLCache
class BookScraper:
def __init__(self, url: str):
self.url = url
- @cached(cache=TTLCache(maxsize=1, ttl=86400))
def scrape(self) -> List[Dict[str, str]]:
html = self.request()
responses = self.parse(html)
From 0810eb009b3a8435a08ae1917087c9ba59733909 Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 19:36:40 -0400
Subject: [PATCH 03/11] Added reading of the GOODREADS_URL environment variable to app
---
app.py | 4 +++-
env.py | 3 +++
2 files changed, 6 insertions(+), 1 deletion(-)
create mode 100644 env.py
diff --git a/app.py b/app.py
index 246afc3..9c1dfe4 100644
--- a/app.py
+++ b/app.py
@@ -1,6 +1,8 @@
from flask import Flask, request, render_template, jsonify
from flask_caching import Cache
+from env import GOODREADS_URL
+
from src.book_scraper import BookScraper
"""
@@ -11,7 +13,7 @@
"""
Scraper for book API
"""
-SCRAPER = BookScraper("")
+SCRAPER = BookScraper(GOODREADS_URL)
"""
App configuration
diff --git a/env.py b/env.py
new file mode 100644
index 0000000..d8f909c
--- /dev/null
+++ b/env.py
@@ -0,0 +1,3 @@
+import os
+
+GOODREADS_URL = os.environ.get('GOODREADS_URL')
\ No newline at end of file
From 3d9a46ecd2c872182228f5526a98c6650f0c17a0 Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 19:37:51 -0400
Subject: [PATCH 04/11] Added assert to scraper
---
src/book_scraper.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/src/book_scraper.py b/src/book_scraper.py
index 66a0208..f337090 100644
--- a/src/book_scraper.py
+++ b/src/book_scraper.py
@@ -4,6 +4,7 @@
class BookScraper:
def __init__(self, url: str):
+ assert url is not None
self.url = url
def scrape(self) -> List[Dict[str, str]]:
From a6ac4bb0c5a3be8dd19b367210ba16ae72811c27 Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 19:43:27 -0400
Subject: [PATCH 05/11] Added env variables support to app.yaml
---
app.yaml | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/app.yaml b/app.yaml
index 83a0c28..979c22f 100644
--- a/app.yaml
+++ b/app.yaml
@@ -14,12 +14,12 @@
runtime: python310
+includes:
+ - env_variables.yaml
+
# Commented out because this is going to be the default service
# service: samoyedapi
-# env_variables:
-# PORT: "8000"
-
instance_class: F1
automatic_scaling:
From d432c8c87807fb06c69c1ac767ff9d47cd243d25 Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 19:46:37 -0400
Subject: [PATCH 06/11] Added env.ps1 to gitignore
---
.gitignore | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/.gitignore b/.gitignore
index 0ea76dd..162840a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,4 +173,5 @@ cython_debug/
.ruff_cache
-env_variables.yaml
\ No newline at end of file
+env_variables.yaml
+env.ps1
\ No newline at end of file
From 4903629a7a9465a46944e77329233372511ec8b9 Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 19:48:21 -0400
Subject: [PATCH 07/11] Added doc string
---
src/book_scraper.py | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/src/book_scraper.py b/src/book_scraper.py
index f337090..700560a 100644
--- a/src/book_scraper.py
+++ b/src/book_scraper.py
@@ -3,6 +3,10 @@
from bs4 import BeautifulSoup
class BookScraper:
+ """
+ Scrapes the available book data at the given url.
+ """
+
def __init__(self, url: str):
assert url is not None
self.url = url
From c5e5775233043b419f9f35643ea976cff1fbad00 Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 19:54:08 -0400
Subject: [PATCH 08/11] Simplified test case
---
tests/test_book_scraper.py | 39 +-------------------------------------
1 file changed, 1 insertion(+), 38 deletions(-)
diff --git a/tests/test_book_scraper.py b/tests/test_book_scraper.py
index 1597a11..5376ed7 100644
--- a/tests/test_book_scraper.py
+++ b/tests/test_book_scraper.py
@@ -163,7 +163,7 @@ def test_book_scraper_parser():
- (4%)
+ (4%)
@@ -172,43 +172,6 @@ def test_book_scraper_parser():
—
-Mar 26, 2024 11:47PM
-
-
-
-
-
-
-
-
-
-
"""
From c2ffa7ae663fcfa69dbc864fff581efb753e4866 Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 19:55:18 -0400
Subject: [PATCH 09/11] Updated expected progress to an int so tests pass
---
tests/test_book_scraper.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/test_book_scraper.py b/tests/test_book_scraper.py
index 5376ed7..eba84c1 100644
--- a/tests/test_book_scraper.py
+++ b/tests/test_book_scraper.py
@@ -185,5 +185,5 @@ def test_book_scraper_parser():
"author": "Sylvia Plath",
"author_url": "http://goodreads.com/author/show/4379.Sylvia_Plath",
"image_url": "https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1554582218l/6514._SX98_.jpg",
- "progress": "4"
+ "progress": 4
} == responses[0]
\ No newline at end of file
From c53da06edac2c095683c39353cdceb3c5c12ce25 Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 19:56:22 -0400
Subject: [PATCH 10/11] Deleted example main
---
main.py | 17 -----------------
1 file changed, 17 deletions(-)
delete mode 100644 main.py
diff --git a/main.py b/main.py
deleted file mode 100644
index 1b7a161..0000000
--- a/main.py
+++ /dev/null
@@ -1,17 +0,0 @@
-from flask import Flask #, render_template
-
-app = Flask(__name__)
-
-@app.route("/")
-def root():
- return "Hello World!"
-
-if __name__ == "__main__":
- # This is used when running locally only. When deploying to Google App
- # Engine, a webserver process such as Gunicorn will serve the app. This
- # can be configured by adding an `entrypoint` to app.yaml.
- # Flask's development server will automatically serve static files in
- # the "static" directory. See:
- # http://flask.pocoo.org/docs/1.0/quickstart/#static-files. Once deployed,
- # App Engine itself will serve those files as configured in app.yaml.
- app.run(host="127.0.0.1", port=8080, debug=True)
\ No newline at end of file
From ba8fae49804be055fb860f3fd412031344571a4e Mon Sep 17 00:00:00 2001
From: jar2333
Date: Wed, 1 May 2024 20:19:14 -0400
Subject: [PATCH 11/11] Added headers to better scrape
---
app.py | 2 +-
src/book_scraper.py | 10 +++++++++-
2 files changed, 10 insertions(+), 2 deletions(-)
diff --git a/app.py b/app.py
index 9c1dfe4..3e3c0d4 100644
--- a/app.py
+++ b/app.py
@@ -36,4 +36,4 @@
@cache.cached(timeout=86400)
def get_books():
results = SCRAPER.scrape()
- return render_template("books.html", results), 200
\ No newline at end of file
+ return render_template("books.html", results=results), 200
\ No newline at end of file
diff --git a/src/book_scraper.py b/src/book_scraper.py
index 700560a..c56e33d 100644
--- a/src/book_scraper.py
+++ b/src/book_scraper.py
@@ -19,7 +19,15 @@ def scrape(self) -> List[Dict[str, str]]:
def request(self) -> str:
try:
- response = requests.get(self.url)
+ response = requests.get(self.url, headers={
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+ 'Accept-Encoding': 'gzip, deflate, br, zstd',
+ 'Accept-Language': 'en-US',
+ 'Connection': 'keep-alive',
+ 'Host': 'www.goodreads.com',
+ 'Upgrade-Insecure-Requests': '1',
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0',
+ })
response.raise_for_status()
except Exception as err:
raise err