From 9034e89d40b7f73eafb9bb865a54d8920edeaa2b Mon Sep 17 00:00:00 2001 From: neonet1 <38730172+neonet1@users.noreply.github.com> Date: Mon, 1 Apr 2024 20:57:25 -0400 Subject: [PATCH 01/11] added books endpoint (pending confidential url mechanism) --- app.py | 28 ++++++++++++++++++++++++---- src/book_scraper.py | 22 ++++++++++++++-------- src/books.py | 7 ------- src/courses.py | 0 templates/books.html | 10 ++++++++++ tests/sample_test.py | 2 -- 6 files changed, 48 insertions(+), 21 deletions(-) delete mode 100644 src/books.py delete mode 100644 src/courses.py create mode 100644 templates/books.html delete mode 100644 tests/sample_test.py diff --git a/app.py b/app.py index fba1d84..7d5f45e 100644 --- a/app.py +++ b/app.py @@ -1,10 +1,30 @@ -from flask import Flask +from flask import Flask, request, render_template, jsonify +# from flask_caching import Cache + +from src.book_scraper import BookScraper + +""" +Mounts a prefix for all routes +""" +SCRIPT_NAME = "/api" + +""" +Scraper for book API +""" +SCRAPER = BookScraper("") """ The main application """ app = Flask(__name__) -@app.route("/") -def root(): - return "Hello World!" +@app.route("/books") +def get_books(): + results = SCRAPER.scrape() + + accepts = request.headers.get("Accept") + match accepts: + case "text/html": + return render_template("books.html", results) + case _: + return jsonify(results) \ No newline at end of file diff --git a/src/book_scraper.py b/src/book_scraper.py index 7059bcd..aaffc78 100644 --- a/src/book_scraper.py +++ b/src/book_scraper.py @@ -31,17 +31,16 @@ def parse(self, html: str) -> List[Dict[str, str]]: # Iterate through children out = [] - for child in root.children: - if isinstance(child, str) or child["class"] == "clear": - continue + for child in root.select("div.Updates"): + assert child.name == "div" # Get first column tags - c1 = soup.select_one("div.firstcol") + c1 = child.select_one("div.firstcol") image_tag = c1.img # Get second column tags - c2 = soup.select_one("div.secondcol") + c2 = child.select_one("div.secondcol") info_tag = c2.select_one("div.whos-review") @@ -60,9 +59,16 @@ def parse(self, html: str) -> List[Dict[str, str]]: image_url = image_tag["src"] - progress_percent = progress_tag.text.strip("()%") - assert progress_percent.isdigit() - + # Extract progress percentage + progress_text = progress_tag.text.strip("()%") + if progress_text.startswith("page"): + split = progress_text.split() + current = int(split[1]) + total = int(split[-1]) + progress_percent = (100*current)//total + else: + progress_percent = int(progress_text) + # Output parsed data out.append({ "title": title, diff --git a/src/books.py b/src/books.py deleted file mode 100644 index 9e5f5f8..0000000 --- a/src/books.py +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - diff --git a/src/courses.py b/src/courses.py deleted file mode 100644 index e69de29..0000000 diff --git a/templates/books.html b/templates/books.html new file mode 100644 index 0000000..9c58587 --- /dev/null +++ b/templates/books.html @@ -0,0 +1,10 @@ +
+ {% for result in results %} +
+ {{result['title']}} +

{{result['title']}}

+

By {{result['author']}}

+

Progress: {{result['progress']}}%

+
+ {% endfor %} +
diff --git a/tests/sample_test.py b/tests/sample_test.py deleted file mode 100644 index a6f1c78..0000000 --- a/tests/sample_test.py +++ /dev/null @@ -1,2 +0,0 @@ -def test_sample(): - assert True \ No newline at end of file From d291772a5ad41b5e837bcc6dedaae182f357051d Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 19:18:11 -0400 Subject: [PATCH 02/11] Fixed caching, simplified app --- .gitignore | 4 +++- app.py | 23 +++++++++++++++-------- src/book_scraper.py | 2 -- 3 files changed, 18 insertions(+), 11 deletions(-) diff --git a/.gitignore b/.gitignore index 0b1f484..0ea76dd 100644 --- a/.gitignore +++ b/.gitignore @@ -171,4 +171,6 @@ cython_debug/ # option (not recommended) you can uncomment the following to ignore the entire idea folder. #.idea/ -.ruff_cache \ No newline at end of file +.ruff_cache + +env_variables.yaml \ No newline at end of file diff --git a/app.py b/app.py index 7d5f45e..246afc3 100644 --- a/app.py +++ b/app.py @@ -1,5 +1,5 @@ from flask import Flask, request, render_template, jsonify -# from flask_caching import Cache +from flask_caching import Cache from src.book_scraper import BookScraper @@ -13,18 +13,25 @@ """ SCRAPER = BookScraper("") +""" +App configuration +""" +config = { + "DEBUG": True, # some Flask specific configs + "CACHE_TYPE": "SimpleCache", # Flask-Caching related configs + "CACHE_DEFAULT_TIMEOUT": 300 +} + """ The main application """ app = Flask(__name__) +app.config.from_mapping(config) + +cache = Cache(app) @app.route("/books") +@cache.cached(timeout=86400) def get_books(): results = SCRAPER.scrape() - - accepts = request.headers.get("Accept") - match accepts: - case "text/html": - return render_template("books.html", results) - case _: - return jsonify(results) \ No newline at end of file + return render_template("books.html", results), 200 \ No newline at end of file diff --git a/src/book_scraper.py b/src/book_scraper.py index aaffc78..66a0208 100644 --- a/src/book_scraper.py +++ b/src/book_scraper.py @@ -1,13 +1,11 @@ from typing import List, Dict import requests from bs4 import BeautifulSoup -from cachetools import cached, TTLCache class BookScraper: def __init__(self, url: str): self.url = url - @cached(cache=TTLCache(maxsize=1, ttl=86400)) def scrape(self) -> List[Dict[str, str]]: html = self.request() responses = self.parse(html) From 0810eb009b3a8435a08ae1917087c9ba59733909 Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 19:36:40 -0400 Subject: [PATCH 03/11] Added environment variable getting to app --- app.py | 4 +++- env.py | 3 +++ 2 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 env.py diff --git a/app.py b/app.py index 246afc3..9c1dfe4 100644 --- a/app.py +++ b/app.py @@ -1,6 +1,8 @@ from flask import Flask, request, render_template, jsonify from flask_caching import Cache +from env import GOODREADS_URL + from src.book_scraper import BookScraper """ @@ -11,7 +13,7 @@ """ Scraper for book API """ -SCRAPER = BookScraper("") +SCRAPER = BookScraper(GOODREADS_URL) """ App configuration diff --git a/env.py b/env.py new file mode 100644 index 0000000..d8f909c --- /dev/null +++ b/env.py @@ -0,0 +1,3 @@ +import os + +GOODREADS_URL = os.environ.get('GOODREADS_URL') \ No newline at end of file From 3d9a46ecd2c872182228f5526a98c6650f0c17a0 Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 19:37:51 -0400 Subject: [PATCH 04/11] Added assert to scraper --- src/book_scraper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/book_scraper.py b/src/book_scraper.py index 66a0208..f337090 100644 --- a/src/book_scraper.py +++ b/src/book_scraper.py @@ -4,6 +4,7 @@ class BookScraper: def __init__(self, url: str): + assert url is not None self.url = url def scrape(self) -> List[Dict[str, str]]: From a6ac4bb0c5a3be8dd19b367210ba16ae72811c27 Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 19:43:27 -0400 Subject: [PATCH 05/11] Added env variables support to app.yaml --- app.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/app.yaml b/app.yaml index 83a0c28..979c22f 100644 --- a/app.yaml +++ b/app.yaml @@ -14,12 +14,12 @@ runtime: python310 +includes: + - env_variables.yaml + # Commented out because this is going to be the default service # service: samoyedapi -# env_variables: -# PORT: "8000" - instance_class: F1 automatic_scaling: From d432c8c87807fb06c69c1ac767ff9d47cd243d25 Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 19:46:37 -0400 Subject: [PATCH 06/11] Added stuff to gitignore --- .gitignore | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 0ea76dd..162840a 100644 --- a/.gitignore +++ b/.gitignore @@ -173,4 +173,5 @@ cython_debug/ .ruff_cache -env_variables.yaml \ No newline at end of file +env_variables.yaml +env.ps1 \ No newline at end of file From 4903629a7a9465a46944e77329233372511ec8b9 Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 19:48:21 -0400 Subject: [PATCH 07/11] Added doc string --- src/book_scraper.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/book_scraper.py b/src/book_scraper.py index f337090..700560a 100644 --- a/src/book_scraper.py +++ b/src/book_scraper.py @@ -3,6 +3,10 @@ from bs4 import BeautifulSoup class BookScraper: + """ + Scrapes the available book data at the given url. + """ + def __init__(self, url: str): assert url is not None self.url = url From c5e5775233043b419f9f35643ea976cff1fbad00 Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 19:54:08 -0400 Subject: [PATCH 08/11] Simplified test case --- tests/test_book_scraper.py | 39 +------------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) diff --git a/tests/test_book_scraper.py b/tests/test_book_scraper.py index 1597a11..5376ed7 100644 --- a/tests/test_book_scraper.py +++ b/tests/test_book_scraper.py @@ -163,7 +163,7 @@ def test_book_scraper_parser():   - (4%) + (4%)
@@ -172,43 +172,6 @@ def test_book_scraper_parser(): -Mar 26, 2024 11:47PM - - - -
- - - - - - - - -
  - - - - - - -
- - - - -
""" From c2ffa7ae663fcfa69dbc864fff581efb753e4866 Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 19:55:18 -0400 Subject: [PATCH 09/11] Tests pass --- tests/test_book_scraper.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_book_scraper.py b/tests/test_book_scraper.py index 5376ed7..eba84c1 100644 --- a/tests/test_book_scraper.py +++ b/tests/test_book_scraper.py @@ -185,5 +185,5 @@ def test_book_scraper_parser(): "author": "Sylvia Plath", "author_url": "http://goodreads.com/author/show/4379.Sylvia_Plath", "image_url": "https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1554582218l/6514._SX98_.jpg", - "progress": "4" + "progress": 4 } == responses[0] \ No newline at end of file From c53da06edac2c095683c39353cdceb3c5c12ce25 Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 19:56:22 -0400 Subject: [PATCH 10/11] Deleted example main --- main.py | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 main.py diff --git a/main.py b/main.py deleted file mode 100644 index 1b7a161..0000000 --- a/main.py +++ /dev/null @@ -1,17 +0,0 @@ -from flask import Flask #, render_template - -app = Flask(__name__) - -@app.route("/") -def root(): - return "Hello World!" - -if __name__ == "__main__": - # This is used when running locally only. When deploying to Google App - # Engine, a webserver process such as Gunicorn will serve the app. This - # can be configured by adding an `entrypoint` to app.yaml. - # Flask's development server will automatically serve static files in - # the "static" directory. See: - # http://flask.pocoo.org/docs/1.0/quickstart/#static-files. Once deployed, - # App Engine itself will serve those files as configured in app.yaml. - app.run(host="127.0.0.1", port=8080, debug=True) \ No newline at end of file From ba8fae49804be055fb860f3fd412031344571a4e Mon Sep 17 00:00:00 2001 From: jar2333 Date: Wed, 1 May 2024 20:19:14 -0400 Subject: [PATCH 11/11] Added headers to better scrape --- app.py | 2 +- src/book_scraper.py | 10 +++++++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/app.py b/app.py index 9c1dfe4..3e3c0d4 100644 --- a/app.py +++ b/app.py @@ -36,4 +36,4 @@ @cache.cached(timeout=86400) def get_books(): results = SCRAPER.scrape() - return render_template("books.html", results), 200 \ No newline at end of file + return render_template("books.html", results=results), 200 \ No newline at end of file diff --git a/src/book_scraper.py b/src/book_scraper.py index 700560a..c56e33d 100644 --- a/src/book_scraper.py +++ b/src/book_scraper.py @@ -19,7 +19,15 @@ def scrape(self) -> List[Dict[str, str]]: def request(self) -> str: try: - response = requests.get(self.url) + response = requests.get(self.url, headers={ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', + 'Accept-Encoding': 'gzip, deflate, br, zstd', + 'Accept-Language': 'en-US', + 'Connection': 'keep-alive', + 'Host': 'www.goodreads.com', + 'Upgrade-Insecure-Requests': '1', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0', + }) response.raise_for_status() except Exception as err: raise err