Merge pull request #7 from jar2333/books

Books route
jar2333 · May 2, 2024 · a0e6a98
2 parents 645923f + ba8fae4
commit a0e6a98
Show file tree

Hide file tree

Showing 11 changed files with 83 additions and 84 deletions.
diff --git a/.gitignore b/.gitignore
@@ -171,4 +171,7 @@ cython_debug/
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 #.idea/
 
-.ruff_cache
+.ruff_cache
+
+env_variables.yaml
+env.ps1
diff --git a/app.py b/app.py
@@ -1,10 +1,39 @@
-from flask import Flask
+from flask import Flask, request, render_template, jsonify
+from flask_caching import Cache
+
+from env import GOODREADS_URL
+
+from src.book_scraper import BookScraper
+
+"""
+Mounts a prefix for all routes
+"""
+SCRIPT_NAME = "/api"
+
+"""
+Scraper for book API
+"""
+SCRAPER = BookScraper(GOODREADS_URL)
+
+"""
+App configuration
+"""
+config = {
+    "DEBUG": True,          # some Flask specific configs
+    "CACHE_TYPE": "SimpleCache",  # Flask-Caching related configs
+    "CACHE_DEFAULT_TIMEOUT": 300
+}
 
 """
 The main application
 """
 app = Flask(__name__)
+app.config.from_mapping(config)
+
+cache = Cache(app)
 
-@app.route("/")
-def root():
-    return "Hello World!"
+@app.route("/books")
+@cache.cached(timeout=86400)
+def get_books():
+    results = SCRAPER.scrape()
+    return render_template("books.html", results=results), 200
diff --git a/app.yaml b/app.yaml
@@ -14,12 +14,12 @@
 
 runtime: python310
 
+includes:
+  - env_variables.yaml
+
 # Commented out because this is going to be the default service
 # service: samoyedapi
 
-# env_variables:
-#   PORT: "8000"
-
 instance_class: F1
 
 automatic_scaling:

diff --git a/env.py b/env.py
@@ -0,0 +1,3 @@
+import os
+
+GOODREADS_URL = os.environ.get('GOODREADS_URL')
diff --git a/main.py b/main.py
diff --git a/src/book_scraper.py b/src/book_scraper.py
@@ -1,13 +1,16 @@
 from typing import List, Dict
 import requests
 from bs4 import BeautifulSoup
-from cachetools import cached, TTLCache
 
 class BookScraper:
+    """
+    Scrapes the available book data at the given url.
+    """
+
     def __init__(self, url: str):
+        assert url is not None
         self.url = url
 
-    @cached(cache=TTLCache(maxsize=1, ttl=86400))
     def scrape(self) -> List[Dict[str, str]]:
         html = self.request()
         responses = self.parse(html)
@@ -16,7 +19,15 @@ def scrape(self) -> List[Dict[str, str]]:
 
     def request(self) -> str:
         try:
-            response = requests.get(self.url)
+            response = requests.get(self.url, headers={
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
+                'Accept-Encoding': 'gzip, deflate, br, zstd',
+                'Accept-Language': 'en-US',
+                'Connection': 'keep-alive',
+                'Host': 'www.goodreads.com',
+                'Upgrade-Insecure-Requests': '1',
+                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0',
+            })
             response.raise_for_status()
         except Exception as err:
             raise err
@@ -31,17 +42,16 @@ def parse(self, html: str) -> List[Dict[str, str]]:
 
         # Iterate through children
         out = []
-        for child in root.children:
-            if isinstance(child, str) or child["class"] == "clear":
-                continue
+        for child in root.select("div.Updates"):
+            assert child.name == "div"
 
             # Get first column tags
-            c1 = soup.select_one("div.firstcol")
+            c1 = child.select_one("div.firstcol")
 
             image_tag = c1.img
 
             # Get second column tags
-            c2 = soup.select_one("div.secondcol")
+            c2 = child.select_one("div.secondcol")
 
             info_tag = c2.select_one("div.whos-review")
 
@@ -60,9 +70,16 @@ def parse(self, html: str) -> List[Dict[str, str]]:
 
             image_url = image_tag["src"]
 
-            progress_percent = progress_tag.text.strip("()%")
-            assert progress_percent.isdigit()
-
+            # Extract progress percentage
+            progress_text = progress_tag.text.strip("()%")
+            if progress_text.startswith("page"):
+                split = progress_text.split()
+                current = int(split[1])
+                total = int(split[-1])
+                progress_percent = (100*current)//total
+            else:
+                progress_percent = int(progress_text)
+
             # Output parsed data
             out.append({
                 "title": title,

diff --git a/src/books.py b/src/books.py
diff --git a/src/courses.py b/src/courses.py
diff --git a/templates/books.html b/templates/books.html
@@ -0,0 +1,10 @@
+<div id="books">
+    {% for result in results %}
+    <div class="book">
+        <img alt="{{result['title']}}" src="{{result['image_url']}}">
+        <a href="{{result['url']}}"><p>{{result['title']}}</p></a>
+        <a href="{{result['author_url']}}"><p>By {{result['author']}}</p></a>
+        <p>Progress: {{result['progress']}}%</p>
+    </div>
+    {% endfor %}
+</div>
diff --git a/tests/sample_test.py b/tests/sample_test.py
diff --git a/tests/test_book_scraper.py b/tests/test_book_scraper.py
@@ -163,7 +163,7 @@ def test_book_scraper_parser():
   </div>
 
   &nbsp;
-    <a onclick="if (typeof(clickPageOfBook) == 'function') {clickPageOfBook(594288, , {viewLink: '<a class=\&quot;greyText\&quot; href=\&quot;https://www.goodreads.com/user_status/show/813489194\&quot;>Mar 26, 2024 11:47PM<\/a>'});return false;};" class="greyText smallText" href="https://www.goodreads.com/user_status/show/813489194">(4%)</a>
+    <a onclick="if (typeof(clickPageOfBook) == 'function') {clickPageOfBook(594288, , {viewLink: '<a class=\&quot;greyText\&quot; href=\&quot;https://www.goodreads.com/user_status/show/13489194\&quot;>Mar 26, 2024 11:47PM<\/a>'});return false;};" class="greyText smallText" href="https://www.goodreads.com/user_status/show/13489194">(4%)</a>
 
   <br class="clear">
     <span class="readable">
@@ -172,43 +172,6 @@ def test_book_scraper_parser():
 
 
 <span class="greyText">—</span>
-<a class="greyText" href="https://www.goodreads.com/user_status/show/813489194">Mar 26, 2024 11:47PM</a>
-
-            </div>
-
-                <br class="clear">
-                <table>
-                
-
-        <tbody><tr class="no_border feedFooterReview" id="update_comment_stuff_Review4917911731">
-            <td>&nbsp;</td>
-        <td colspan="2">
-            <div class="updateActionLinks">
-
-
-
-
-
-                            <a class="updatedTimestamp" ref="timestamp" href="/review/show/4917911731">Sept 30, 2019 09:00AM</a>
-        &nbsp;·&nbsp;<a id="commentLink_4917911731" href="#" onclick="comment_form_for('review', 4917911731, true, ''); return false;">comment</a>
-            </div>
-
-
-            <div id="comments_for_review_4917911731" style="display: none;">
-        </div>
-        <div class="brown_comment" id="comments_form_review_4917911731" style="display: none">
-        <textarea class="placeholder_text" onclick="expand_comment_form_for('review', 4917911731, true, '')">Write a comment...</textarea>
-        </div>
-
-        </td>
-        </tr>
-
-                </tbody></table>
-            </div>
-            </div>
-
-
-        <div class="clear"></div></div>
 
     """
 
@@ -222,5 +185,5 @@ def test_book_scraper_parser():
             "author": "Sylvia Plath",
             "author_url": "http://goodreads.com/author/show/4379.Sylvia_Plath",
             "image_url": "https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1554582218l/6514._SX98_.jpg",
-            "progress": "4"
+            "progress": 4
     } == responses[0]
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		import os

		GOODREADS_URL = os.environ.get('GOODREADS_URL')