From f9cc2c68a68362dcf1ede103713278117021eef5 Mon Sep 17 00:00:00 2001 From: mucchu Date: Sun, 24 Dec 2023 14:48:16 +0530 Subject: [PATCH 1/7] modified scraper.py to use requests instead of urllib. Additionally removed the line importing json lib twice since its already imported in line 7. --- openedx2zim/scraper.py | 45 ++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 17 deletions(-) diff --git a/openedx2zim/scraper.py b/openedx2zim/scraper.py index 4e1f1d5..808f6aa 100644 --- a/openedx2zim/scraper.py +++ b/openedx2zim/scraper.py @@ -12,9 +12,8 @@ import shutil import sys import tempfile -import urllib +import requests import uuid -import json from bs4 import BeautifulSoup from kiwixstorage import KiwixStorage @@ -125,7 +124,6 @@ def __init__( watcher_min_dl_count, watcher_min_ratio, ): - # video-encoding info self.video_format = video_format self.low_quality = low_quality @@ -230,7 +228,7 @@ def get_course_id(self, url, course_page_name, course_prefix, instance_url): ] if "%3" in clean_id: # course_id seems already encode return clean_id - return urllib.parse.quote_plus(clean_id) + return requests.utils.quote(clean_id, safe="") def prepare_mooc_data(self): self.instance_url = self.instance_config["instance_url"] @@ -259,9 +257,16 @@ def prepare_mooc_data(self): ) self.course_xblocks = xblocks_data["blocks"] if self.debug: - with tempfile.NamedTemporaryFile(dir=self.build_dir.joinpath("logs"), suffix=".json", mode="wt", delete=False) as fp: + with tempfile.NamedTemporaryFile( + dir=self.build_dir.joinpath("logs"), + suffix=".json", + mode="wt", + delete=False, + ) as fp: json.dump(self.course_xblocks, fp, indent=4) - logger.debug(f"Saved API response while fetching list of course blocks in {fp.name}") + logger.debug( + f"Saved API response while fetching list of course blocks in {fp.name}" + ) self.root_xblock_id = xblocks_data["root"] def parse_course_xblocks(self): @@ -334,7 +339,9 @@ def get_book_list(self, book, output_path): pdf = book.find_all("a") book_list = [] for url in pdf: - file_name = pathlib.Path(urllib.parse.urlparse(url["rel"][0]).path).name + file_name = pathlib.Path( + requests.utils.urlparse(url.get("rel", [""])[0]).path + ).name if self.download_file( prepare_url(url["rel"][0], self.instance_url), output_path.joinpath(file_name), @@ -346,7 +353,9 @@ def annex_extra_page(self, tab_href, tab_org_path): output_path = self.build_dir.joinpath(tab_org_path) output_path.mkdir(parents=True, exist_ok=True) try: - page_content = self.instance_connection.get_page(self.instance_url + tab_href) + page_content = self.instance_connection.get_page( + self.instance_url + tab_href + ) except Exception: logger.error(f"Failed to get page content for tab {tab_org_path}") raise SystemExit(1) @@ -491,7 +500,7 @@ def annex(self): self.forum.annex_forum() def get_favicon(self): - """ get the favicon from the given URL for the instance or the fallback URL """ + """get the favicon from the given URL for the instance or the fallback URL""" favicon_fpath = self.build_dir.joinpath("favicon.png") @@ -506,10 +515,10 @@ def get_favicon(self): raise Exception("Favicon download failed") def get_content(self): - """ download the content for the course """ + """download the content for the course""" def clean_content(html_article): - """ removes unwanted elements from homepage html """ + """removes unwanted elements from homepage html""" unwanted_elements = { "div": {"class": "dismiss-message"}, @@ -582,7 +591,9 @@ def clean_content(html_article): concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED) if BaseXblock.too_many_failures(): - logger.error("Stopping scrapper because too many errors occured while getting content") + logger.error( + "Stopping scrapper because too many errors occured while getting content" + ) if self.debug: print("Xblock download failure details:", file=sys.stderr) json.dump(BaseXblock.watcher.failed_xblocks, sys.stderr, indent=4) @@ -605,7 +616,7 @@ def s3_credentials_ok(self): return True def download_from_cache(self, key, fpath, meta): - """ whether it downloaded from S3 cache """ + """whether it downloaded from S3 cache""" filetype = "jpeg" if fpath.suffix in [".jpeg", ".jpg"] else fpath.suffix[1:] if not self.s3_storage.has_object(key) or not meta: @@ -627,7 +638,7 @@ def download_from_cache(self, key, fpath, meta): return True def upload_to_cache(self, key, fpath, meta): - """ whether it uploaded to S3 cache """ + """whether it uploaded to S3 cache""" filetype = "jpeg" if fpath.suffix in [".jpeg", ".jpg"] else fpath.suffix[1:] if not meta or not filetype: @@ -727,9 +738,9 @@ def generate_s3_key(self, url, fpath): quality = "low" if self.low_quality else "high" else: quality = "default" - src_url = urllib.parse.urlparse(url) + src_url = requests.utils.urlparse(url) prefix = f"{src_url.scheme}://{src_url.netloc}/" - safe_url = f"{src_url.netloc}/{urllib.parse.quote_plus(src_url.geturl()[len(prefix):])}" + safe_url = f"{src_url.netloc}/{requests.utils.quote(src_url.geturl()[len(prefix):], safe='~')}" # safe url looks similar to ww2.someplace.state.gov/data%2F%C3%A9t%C3%A9%2Fsome+chars%2Fimage.jpeg%3Fv%3D122%26from%3Dxxx%23yes return f"{fpath.suffix[1:]}/{safe_url}/{quality}" @@ -868,7 +879,7 @@ def run(self): ) # update instance config - instance_netloc = urllib.parse.urlparse(self.course_url).netloc + instance_netloc = requests.utils.urlparse(self.course_url).netloc self.instance_config.update({"instance_url": f"https://{instance_netloc}"}) logger.info("Testing openedx instance credentials ...") self.instance_connection = InstanceConnection( From 3f6cfbaa1d21393ec98f9b32b2952989633e8879 Mon Sep 17 00:00:00 2001 From: mucchu Date: Sun, 24 Dec 2023 14:59:12 +0530 Subject: [PATCH 2/7] Modified html_processor.py file to use requests instead of urllib and formatted using black --- openedx2zim/html_processor.py | 58 ++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 25 deletions(-) diff --git a/openedx2zim/html_processor.py b/openedx2zim/html_processor.py index e4e6cb6..8c3c5a5 100644 --- a/openedx2zim/html_processor.py +++ b/openedx2zim/html_processor.py @@ -1,6 +1,6 @@ import pathlib import re -import urllib +import requests import xxhash import lxml.html @@ -26,9 +26,10 @@ def download_and_get_filename( """downloads a file from src and return the name of the downloaded file with_ext: ensure that downloaded file has the given extension - filter_ext: download only if the file to download has an extension in this list""" + filter_ext: download only if the file to download has an extension in this list + """ - server_path = pathlib.Path(urllib.parse.urlparse(src).path) + server_path = pathlib.Path(requests.utils.urlparse(src).path) ext = with_ext if with_ext else server_path.suffix if server_path.suffix: @@ -57,7 +58,8 @@ def download_dependencies_from_css( - css_org_url: URL to the CSS file on the internet - css_path: path of CSS on the filesystem (Path) - - output_path_from_css: string representing path of the output directory relative to css_path""" + - output_path_from_css: string representing path of the output directory relative to css_path + """ def encapsulate(url): return f"url({url})" @@ -71,7 +73,7 @@ def remove_quotes(url): # ensure the original CSS url has netloc css_org_url = prepare_url(css_org_url, netloc, path_on_server) - css_org_url = urllib.parse.urlparse(css_org_url) + css_org_url = requests.utils.urlparse(css_org_url) with open(css_path, "r") as fp: content = fp.read() @@ -93,7 +95,7 @@ def remove_quotes(url): continue # add netloc if not present - parsed_url = urllib.parse.urlparse(css_url) + parsed_url = requests.utils.urlparse(css_url) if parsed_url.netloc == "": if parsed_url.path.startswith("/"): css_url = ( @@ -120,7 +122,7 @@ def remove_quotes(url): path_on_server=path_on_server, with_ext=".css", ) - parsed_css_url = urllib.parse.urlparse(css_url) + parsed_css_url = requests.utils.urlparse(css_url) self.download_dependencies_from_css( css_org_url=css_url, css_path=output_path.joinpath(filename), @@ -147,7 +149,7 @@ def remove_quotes(url): def download_images_from_html( self, html_body, output_path, path_from_html, netloc, path_on_server ): - """ download images from tag and fix path """ + """download images from tag and fix path""" imgs = html_body.xpath("//img") for img in imgs: @@ -171,7 +173,7 @@ def download_images_from_html( return bool(imgs) def get_root_from_asset(self, path_from_html, root_from_html): - """ get path to root from the downloaded/generated asset """ + """get path to root from the downloaded/generated asset""" # return original root if path_from_html is empty if path_from_html == "": @@ -200,7 +202,7 @@ def download_documents_from_html( netloc, path_on_server, ): - """ download documents from tag and fix path """ + """download documents from tag and fix path""" anchors = html_body.xpath("//a") for anchor in anchors: @@ -242,7 +244,7 @@ def get_path_and_netloc_to_send(self, netloc, path_on_server, downloaded_asset_u """get the path and netloc to send recursively after downloading asset from downloaded_asset_url path_on_server is the current path on server and netloc is the current netloc""" - parsed_src = urllib.parse.urlparse(downloaded_asset_url) + parsed_src = requests.utils.urlparse(downloaded_asset_url) path_recursive = path_on_server if parsed_src.path: asset_path_on_server = pathlib.Path(parsed_src.path) @@ -262,7 +264,7 @@ def get_path_and_netloc_to_send(self, netloc, path_on_server, downloaded_asset_u def download_css_from_html( self, html_body, output_path, path_from_html, netloc, path_on_server ): - """ download css files from tag and fix path """ + """download css files from tag and fix path""" css_files = html_body.xpath("//link") for css in css_files: @@ -298,7 +300,7 @@ def download_css_from_html( def download_js_from_html( self, html_body, output_path, path_from_html, netloc, path_on_server ): - """ download javascript from