From f9cc2c68a68362dcf1ede103713278117021eef5 Mon Sep 17 00:00:00 2001
From: mucchu <harshwasnik160602@gmail.com>
Date: Sun, 24 Dec 2023 14:48:16 +0530
Subject: [PATCH 1/7] modified scraper.py to use requests instead of urllib.
 Additionally removed the duplicate json import since it is already
 imported on line 7.

---
 openedx2zim/scraper.py | 45 ++++++++++++++++++++++++++----------------
 1 file changed, 28 insertions(+), 17 deletions(-)

diff --git a/openedx2zim/scraper.py b/openedx2zim/scraper.py
index 4e1f1d5..808f6aa 100644
--- a/openedx2zim/scraper.py
+++ b/openedx2zim/scraper.py
@@ -12,9 +12,8 @@
 import shutil
 import sys
 import tempfile
-import urllib
+import requests
 import uuid
-import json
 
 from bs4 import BeautifulSoup
 from kiwixstorage import KiwixStorage
@@ -125,7 +124,6 @@ def __init__(
         watcher_min_dl_count,
         watcher_min_ratio,
     ):
-
         # video-encoding info
         self.video_format = video_format
         self.low_quality = low_quality
@@ -230,7 +228,7 @@ def get_course_id(self, url, course_page_name, course_prefix, instance_url):
         ]
         if "%3" in clean_id:  # course_id seems already encode
             return clean_id
-        return urllib.parse.quote_plus(clean_id)
+        return requests.utils.quote(clean_id, safe="")
 
     def prepare_mooc_data(self):
         self.instance_url = self.instance_config["instance_url"]
@@ -259,9 +257,16 @@ def prepare_mooc_data(self):
         )
         self.course_xblocks = xblocks_data["blocks"]
         if self.debug:
-            with tempfile.NamedTemporaryFile(dir=self.build_dir.joinpath("logs"), suffix=".json", mode="wt", delete=False) as fp:
+            with tempfile.NamedTemporaryFile(
+                dir=self.build_dir.joinpath("logs"),
+                suffix=".json",
+                mode="wt",
+                delete=False,
+            ) as fp:
                 json.dump(self.course_xblocks, fp, indent=4)
-                logger.debug(f"Saved API response while fetching list of course blocks in {fp.name}")
+                logger.debug(
+                    f"Saved API response while fetching list of course blocks in {fp.name}"
+                )
         self.root_xblock_id = xblocks_data["root"]
 
     def parse_course_xblocks(self):
@@ -334,7 +339,9 @@ def get_book_list(self, book, output_path):
         pdf = book.find_all("a")
         book_list = []
         for url in pdf:
-            file_name = pathlib.Path(urllib.parse.urlparse(url["rel"][0]).path).name
+            file_name = pathlib.Path(
+                requests.utils.urlparse(url.get("rel", [""])[0]).path
+            ).name
             if self.download_file(
                 prepare_url(url["rel"][0], self.instance_url),
                 output_path.joinpath(file_name),
@@ -346,7 +353,9 @@ def annex_extra_page(self, tab_href, tab_org_path):
         output_path = self.build_dir.joinpath(tab_org_path)
         output_path.mkdir(parents=True, exist_ok=True)
         try:
-            page_content = self.instance_connection.get_page(self.instance_url + tab_href)
+            page_content = self.instance_connection.get_page(
+                self.instance_url + tab_href
+            )
         except Exception:
             logger.error(f"Failed to get page content for tab {tab_org_path}")
             raise SystemExit(1)
@@ -491,7 +500,7 @@ def annex(self):
             self.forum.annex_forum()
 
     def get_favicon(self):
-        """ get the favicon from the given URL for the instance or the fallback URL """
+        """get the favicon from the given URL for the instance or the fallback URL"""
 
         favicon_fpath = self.build_dir.joinpath("favicon.png")
 
@@ -506,10 +515,10 @@ def get_favicon(self):
             raise Exception("Favicon download failed")
 
     def get_content(self):
-        """ download the content for the course """
+        """download the content for the course"""
 
         def clean_content(html_article):
-            """ removes unwanted elements from homepage html """
+            """removes unwanted elements from homepage html"""
 
             unwanted_elements = {
                 "div": {"class": "dismiss-message"},
@@ -582,7 +591,9 @@ def clean_content(html_article):
             concurrent.futures.wait(fs, return_when=concurrent.futures.ALL_COMPLETED)
 
         if BaseXblock.too_many_failures():
-            logger.error("Stopping scrapper because too many errors occured while getting content")
+            logger.error(
+                "Stopping scrapper because too many errors occured while getting content"
+            )
             if self.debug:
                 print("Xblock download failure details:", file=sys.stderr)
                 json.dump(BaseXblock.watcher.failed_xblocks, sys.stderr, indent=4)
@@ -605,7 +616,7 @@ def s3_credentials_ok(self):
         return True
 
     def download_from_cache(self, key, fpath, meta):
-        """ whether it downloaded from S3 cache """
+        """whether it downloaded from S3 cache"""
 
         filetype = "jpeg" if fpath.suffix in [".jpeg", ".jpg"] else fpath.suffix[1:]
         if not self.s3_storage.has_object(key) or not meta:
@@ -627,7 +638,7 @@ def download_from_cache(self, key, fpath, meta):
         return True
 
     def upload_to_cache(self, key, fpath, meta):
-        """ whether it uploaded to S3 cache """
+        """whether it uploaded to S3 cache"""
 
         filetype = "jpeg" if fpath.suffix in [".jpeg", ".jpg"] else fpath.suffix[1:]
         if not meta or not filetype:
@@ -727,9 +738,10 @@ def generate_s3_key(self, url, fpath):
             quality = "low" if self.low_quality else "high"
         else:
             quality = "default"
-        src_url = urllib.parse.urlparse(url)
+        src_url = requests.utils.urlparse(url)
         prefix = f"{src_url.scheme}://{src_url.netloc}/"
-        safe_url = f"{src_url.netloc}/{urllib.parse.quote_plus(src_url.geturl()[len(prefix):])}"
+        # quote_plus (not quote): spaces stay "+", so keys are identical to the ones built before this change
+        safe_url = f"{src_url.netloc}/{requests.compat.quote_plus(src_url.geturl()[len(prefix):])}"
         # safe url looks similar to ww2.someplace.state.gov/data%2F%C3%A9t%C3%A9%2Fsome+chars%2Fimage.jpeg%3Fv%3D122%26from%3Dxxx%23yes
         return f"{fpath.suffix[1:]}/{safe_url}/{quality}"
 
@@ -868,7 +879,7 @@ def run(self):
             )
 
         # update instance config
-        instance_netloc = urllib.parse.urlparse(self.course_url).netloc
+        instance_netloc = requests.utils.urlparse(self.course_url).netloc
         self.instance_config.update({"instance_url": f"https://{instance_netloc}"})
         logger.info("Testing openedx instance credentials ...")
         self.instance_connection = InstanceConnection(

From 3f6cfbaa1d21393ec98f9b32b2952989633e8879 Mon Sep 17 00:00:00 2001
From: mucchu <harshwasnik160602@gmail.com>
Date: Sun, 24 Dec 2023 14:59:12 +0530
Subject: [PATCH 2/7] Modified html_processor.py file to use requests instead
 of urllib and formatted using black

---
 openedx2zim/html_processor.py | 58 ++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 25 deletions(-)

diff --git a/openedx2zim/html_processor.py b/openedx2zim/html_processor.py
index e4e6cb6..8c3c5a5 100644
--- a/openedx2zim/html_processor.py
+++ b/openedx2zim/html_processor.py
@@ -1,6 +1,6 @@
 import pathlib
 import re
-import urllib
+import requests
 
 import xxhash
 import lxml.html
@@ -26,9 +26,10 @@ def download_and_get_filename(
         """downloads a file from src and return the name of the downloaded file
 
         with_ext: ensure that downloaded file has the given extension
-        filter_ext: download only if the file to download has an extension in this list"""
+        filter_ext: download only if the file to download has an extension in this list
+        """
 
-        server_path = pathlib.Path(urllib.parse.urlparse(src).path)
+        server_path = pathlib.Path(requests.utils.urlparse(src).path)
         ext = with_ext if with_ext else server_path.suffix
 
         if server_path.suffix:
@@ -57,7 +58,8 @@ def download_dependencies_from_css(
 
         - css_org_url: URL to the CSS file on the internet
         - css_path: path of CSS on the filesystem (Path)
-        - output_path_from_css: string representing path of the output directory relative to css_path"""
+        - output_path_from_css: string representing path of the output directory relative to css_path
+        """
 
         def encapsulate(url):
             return f"url({url})"
@@ -71,7 +73,7 @@ def remove_quotes(url):
 
         # ensure the original CSS url has netloc
         css_org_url = prepare_url(css_org_url, netloc, path_on_server)
-        css_org_url = urllib.parse.urlparse(css_org_url)
+        css_org_url = requests.utils.urlparse(css_org_url)
 
         with open(css_path, "r") as fp:
             content = fp.read()
@@ -93,7 +95,7 @@ def remove_quotes(url):
                 continue
 
             # add netloc if not present
-            parsed_url = urllib.parse.urlparse(css_url)
+            parsed_url = requests.utils.urlparse(css_url)
             if parsed_url.netloc == "":
                 if parsed_url.path.startswith("/"):
                     css_url = (
@@ -120,7 +122,7 @@ def remove_quotes(url):
                     path_on_server=path_on_server,
                     with_ext=".css",
                 )
-                parsed_css_url = urllib.parse.urlparse(css_url)
+                parsed_css_url = requests.utils.urlparse(css_url)
                 self.download_dependencies_from_css(
                     css_org_url=css_url,
                     css_path=output_path.joinpath(filename),
@@ -147,7 +149,7 @@ def remove_quotes(url):
     def download_images_from_html(
         self, html_body, output_path, path_from_html, netloc, path_on_server
     ):
-        """ download images from <img> tag and fix path """
+        """download images from <img> tag and fix path"""
 
         imgs = html_body.xpath("//img")
         for img in imgs:
@@ -171,7 +173,7 @@ def download_images_from_html(
         return bool(imgs)
 
     def get_root_from_asset(self, path_from_html, root_from_html):
-        """ get path to root from the downloaded/generated asset """
+        """get path to root from the downloaded/generated asset"""
 
         # return original root if path_from_html is empty
         if path_from_html == "":
@@ -200,7 +202,7 @@ def download_documents_from_html(
         netloc,
         path_on_server,
     ):
-        """ download documents from <a> tag and fix path """
+        """download documents from <a> tag and fix path"""
 
         anchors = html_body.xpath("//a")
         for anchor in anchors:
@@ -242,7 +244,7 @@ def get_path_and_netloc_to_send(self, netloc, path_on_server, downloaded_asset_u
         """get the path and netloc to send recursively after downloading asset from downloaded_asset_url
         path_on_server is the current path on server and netloc is the current netloc"""
 
-        parsed_src = urllib.parse.urlparse(downloaded_asset_url)
+        parsed_src = requests.utils.urlparse(downloaded_asset_url)
         path_recursive = path_on_server
         if parsed_src.path:
             asset_path_on_server = pathlib.Path(parsed_src.path)
@@ -262,7 +264,7 @@ def get_path_and_netloc_to_send(self, netloc, path_on_server, downloaded_asset_u
     def download_css_from_html(
         self, html_body, output_path, path_from_html, netloc, path_on_server
     ):
-        """ download css files from <link> tag and fix path """
+        """download css files from <link> tag and fix path"""
 
         css_files = html_body.xpath("//link")
         for css in css_files:
@@ -298,7 +300,7 @@ def download_css_from_html(
     def download_js_from_html(
         self, html_body, output_path, path_from_html, netloc, path_on_server
     ):
-        """ download javascript from <script> tag and fix path """
+        """download javascript from <script> tag and fix path"""
 
         js_files = html_body.xpath("//script")
         for js in js_files:
@@ -320,7 +322,7 @@ def download_js_from_html(
     def download_sources_from_html(
         self, html_body, output_path, path_from_html, netloc, path_on_server
     ):
-        """ downloads content from <source> tags """
+        """downloads content from <source> tags"""
 
         sources = html_body.xpath("//source")
         for source in sources:
@@ -348,7 +350,7 @@ def download_iframes_from_html(
         netloc,
         path_on_server,
     ):
-        """ download youtube videos and pdf files from iframes in html content """
+        """download youtube videos and pdf files from iframes in html content"""
 
         iframes = html_body.xpath("//iframe")
         for iframe in iframes:
@@ -395,7 +397,9 @@ def download_iframes_from_html(
                     # handle iframe recursively
                     iframe_url = prepare_url(src, netloc)
                     try:
-                        src_content = self.scraper.instance_connection.get_page(iframe_url)
+                        src_content = self.scraper.instance_connection.get_page(
+                            iframe_url
+                        )
                     except Exception:
                         continue
                     path_recursive, netloc_recursive = self.get_path_and_netloc_to_send(
@@ -425,7 +429,7 @@ def download_iframes_from_html(
         return bool(iframes)
 
     def handle_jump_to_paths(self, target_path):
-        """ return a fixed path in zim for a inter-xblock path containing jump_to """
+        """return a fixed path in zim for a inter-xblock path containing jump_to"""
 
         def check_descendants_and_return_path(xblock_extractor):
             if xblock_extractor.xblock_json["type"] in ["vertical", "course"]:
@@ -436,7 +440,9 @@ def check_descendants_and_return_path(xblock_extractor):
 
         for xblock_extractor in self.scraper.xblock_extractor_objects:
             if (xblock_extractor.xblock_json["block_id"] == target_path.parts[-1]) or (
-                urllib.parse.urlparse(xblock_extractor.xblock_json["lms_web_url"]).path
+                requests.utils.urlparse(
+                    xblock_extractor.xblock_json["lms_web_url"]
+                ).path
                 == str(target_path)
             ):
                 # we have a path match, we now check xblock type to redirect properly
@@ -444,7 +450,7 @@ def check_descendants_and_return_path(xblock_extractor):
                 return check_descendants_and_return_path(xblock_extractor)
 
     def rewrite_internal_links(self, html_body, root_from_html, netloc):
-        """ rewrites internal links and ensures no root-relative links are left behind """
+        """rewrites internal links and ensures no root-relative links are left behind"""
 
         def update_root_relative_path(anchor, fixed_path, root_from_html, netloc):
             """updates a root-relative path to the fixed path in zim
@@ -456,12 +462,12 @@ def update_root_relative_path(anchor, fixed_path, root_from_html, netloc):
                 anchor.attrib["href"] = netloc + anchor.attrib["href"]
 
         anchors = html_body.xpath("//a")
-        path_prefix = f"{self.scraper.instance_config['course_prefix']}{urllib.parse.unquote_plus(self.scraper.course_id)}"
+        path_prefix = f"{self.scraper.instance_config['course_prefix']}{requests.utils.unquote(self.scraper.course_id)}"
         has_changed = False
         for anchor in anchors:
             if "href" not in anchor.attrib:
                 continue
-            src = urllib.parse.urlparse(anchor.attrib["href"])
+            src = requests.utils.urlparse(anchor.attrib["href"])
 
             # ignore external links
             if src.netloc and src.netloc != self.scraper.instance_url:
@@ -507,7 +513,7 @@ def dl_dependencies_and_fix_links(
         netloc=None,
         path_on_server="",
     ):
-        """ downloads all static dependencies from an HTML content, and fixes links """
+        """downloads all static dependencies from an HTML content, and fixes links"""
 
         if not netloc:
             netloc = self.scraper.instance_url
@@ -551,7 +557,7 @@ def dl_dependencies_and_fix_links(
         return content
 
     def defer_scripts(self, content, output_path, path_from_html):
-        """ defer all scripts in content. For inline scripts, they're placed in a *.js file and deferred """
+        """defer all scripts in content. For inline scripts, they're placed in a *.js file and deferred"""
 
         soup = BeautifulSoup(content, "lxml")
         script_tags = soup.find_all("script")
@@ -590,7 +596,8 @@ def extract_head_css_js(self, soup, output_path, path_from_html, root_from_html)
 
         output_path: a Path object to store the downloaded CSS/JS to
         path_from_html: a string representing the path to output_path from the resultant HTML
-        root_from_html: a string representing the path to the root from the resultant HTML"""
+        root_from_html: a string representing the path to the root from the resultant HTML
+        """
 
         html_headers = soup.find("head")
         head_css_js = (
@@ -622,7 +629,8 @@ def extract_body_end_scripts(
 
         output_path: a Path object to store the downloaded CSS/JS to
         path_from_html: a string representing the path to output_path from the resultant HTML
-        root_from_html: a string representing the path to the root from the resultant HTML"""
+        root_from_html: a string representing the path to the root from the resultant HTML
+        """
 
         html_body = soup.find("body")
         body_scripts = html_body.find_all("script", recursive=False)

From 7c2928fe5c9e6bf3299f0b1e67c70d936363dcbb Mon Sep 17 00:00:00 2001
From: mucchu <harshwasnik160602@gmail.com>
Date: Tue, 26 Dec 2023 11:46:29 +0530
Subject: [PATCH 3/7] modified instance_connection.py file to eliminate the
 use of the urllib library

---
 openedx2zim/instance_connection.py | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/openedx2zim/instance_connection.py b/openedx2zim/instance_connection.py
index 06ec06a..ac6efc0 100644
--- a/openedx2zim/instance_connection.py
+++ b/openedx2zim/instance_connection.py
@@ -3,7 +3,7 @@
 import http
 import json
 import sys
-import urllib
+import requests
 
 from .constants import getLogger, LANGUAGE_COOKIES, OPENEDX_LANG_MAP
 
@@ -25,17 +25,26 @@ def __init__(self, email, password, instance_config, locale, build_dir, debug):
         self.debug = debug
 
     def get_response(self, url, post_data, headers, max_attempts=5):
-        req = urllib.request.Request(url, post_data, headers)
+        """return the utf-8 decoded body of url (GET if post_data is None, POST otherwise)"""
+        method = "GET" if post_data is None else "POST"
         for attempt in range(max_attempts):
             try:
-                return urllib.request.urlopen(req).read().decode("utf-8")
-            except urllib.error.HTTPError as exc:
+                with requests.Session() as session:
+                    # bind our jar so its cookies are sent and new ones are stored in it
+                    session.cookies = self.cookie_jar
+                    response = session.request(
+                        method, url, data=post_data, headers=headers
+                    )
+                # requests only raises HTTPError on 4xx/5xx when asked to
+                response.raise_for_status()
+                return response.content.decode("utf-8")
+            except requests.exceptions.HTTPError as exc:
                 logger.debug(f"HTTP Error (won't retry this kind of error) while opening {url}: {exc}")
                 if self.debug:
-                    responseData = exc.read().decode("utf8", 'ignore')
+                    responseData = exc.response.content.decode("utf8", 'ignore')
                     print(responseData, file=sys.stderr)
                 raise exc
-            except urllib.error.URLError as exc:
+            except requests.exceptions.RequestException as exc:
                 if attempt < max_attempts - 1:
                     logger.debug(f"Error opening {url}: {exc}\nRetrying ...")
                     continue
@@ -56,11 +56,13 @@ def update_csrf_token_in_headers(self):
         self.headers.update({"X-CSRFToken": csrf_token})
 
     def generate_connection_headers(self):
-        opener = urllib.request.build_opener(
-            urllib.request.HTTPCookieProcessor(self.cookie_jar)
-        )
-        opener.addheaders = [("User-Agent", "Mozilla/5.0")]
-        urllib.request.install_opener(opener)
-        opener.open(self.instance_config["instance_url"] + "/login")
+        # load the login page through a session bound to our cookie jar so the
+        # cookies set by that response are stored in self.cookie_jar
+        with requests.Session() as session:
+            session.cookies = self.cookie_jar
+            session.get(
+                self.instance_config["instance_url"] + "/login",
+                headers={"User-Agent": "Mozilla/5.0"},
+            ).raise_for_status()
         self.headers = {
             "User-Agent": "Mozilla/5.0",
@@ -74,9 +72,7 @@ def generate_connection_headers(self):
 
     def establish_connection(self):
         self.generate_connection_headers()
-        post_data = urllib.parse.urlencode(
-            {"email": self.email, "password": self.password, "remember": False}
-        ).encode("utf-8")
+        post_data = {"email": self.email, "password": self.password, "remember": False}
         # API login can also be used : /user_api/v1/account/login_session/
         self.instance_connection = self.get_api_json(
             self.instance_config["login_page"], post_data, max_attempts=1
@@ -122,5 +118,9 @@ def get_page(self, url):
 
     def get_redirection(self, url):
+        """return the final URL reached when requesting url, redirects followed"""
         self.update_csrf_token_in_headers()
-        req = urllib.request.Request(url, None, self.headers)
-        return urllib.request.urlopen(req).geturl()
+        # GET follows redirects by default; send the cookies held in our jar
+        response = requests.get(url, headers=self.headers, cookies=self.cookie_jar)
+        # urlopen raised on 4xx/5xx; keep failing rather than return an error page URL
+        response.raise_for_status()
+        return response.url

From 653cac244301e9bf91d72f53e97e6d7236fe298a Mon Sep 17 00:00:00 2001
From: mucchu <harshwasnik160602@gmail.com>
Date: Tue, 26 Dec 2023 11:48:12 +0530
Subject: [PATCH 4/7] updated utils.py to remove the urllib library

---
 openedx2zim/utils.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/openedx2zim/utils.py b/openedx2zim/utils.py
index 4c2fda2..bd31b9a 100644
--- a/openedx2zim/utils.py
+++ b/openedx2zim/utils.py
@@ -2,7 +2,6 @@
 import mimetypes
 import pathlib
 import re
-import urllib
 import shlex
 import subprocess
 import zlib
@@ -28,7 +27,7 @@ def prepare_url(url, netloc, path_on_remote=None):
         # add a scheme first to prevent wrong URL parsing
         if not url.startswith("http://") and not url.startswith("https://"):
             url = f"http://{url}"
-        parsed_url = urllib.parse.urlparse(url)
+        parsed_url = requests.utils.urlparse(url)
         if not parsed_url.netloc and path_on_remote:
             url = f"{netloc}{str(pathlib.Path(path_on_remote).joinpath(url))}"
     return url

From 5c756f2b0dcb73a065b04d3a0a7d1d06c768374b Mon Sep 17 00:00:00 2001
From: mucchu <harshwasnik160602@gmail.com>
Date: Tue, 26 Dec 2023 11:50:50 +0530
Subject: [PATCH 5/7] Changed problem.py to remove uses of urllib library

---
 openedx2zim/xblocks_extractor/problem.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/openedx2zim/xblocks_extractor/problem.py b/openedx2zim/xblocks_extractor/problem.py
index 8cc3620..f98b073 100644
--- a/openedx2zim/xblocks_extractor/problem.py
+++ b/openedx2zim/xblocks_extractor/problem.py
@@ -1,7 +1,6 @@
 import json
 import uuid
-import itertools
-import urllib
+import itertools 
 
 from bs4 import BeautifulSoup
 
@@ -65,8 +64,6 @@ def check_answer(self, answer_candidate, instance_connection):
                 }
             )
 
-        # encode the payload as a byte string
-        post_data = urllib.parse.urlencode(post_data).encode("utf-8")
 
         # send a POST request to the instance with the payload
         return instance_connection.get_api_json(

From f794e3442517cfff79dd459230ba74e16238b500 Mon Sep 17 00:00:00 2001
From: mucchu <harshwasnik160602@gmail.com>
Date: Tue, 26 Dec 2023 11:56:07 +0530
Subject: [PATCH 6/7] updated video.py to replace urllib by requests, formatted
 using black and removed unused import

---
 openedx2zim/xblocks_extractor/video.py | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/openedx2zim/xblocks_extractor/video.py b/openedx2zim/xblocks_extractor/video.py
index f721586..cf82955 100644
--- a/openedx2zim/xblocks_extractor/video.py
+++ b/openedx2zim/xblocks_extractor/video.py
@@ -1,11 +1,11 @@
 import json
 import re
-import urllib
+import requests
 
 from bs4 import BeautifulSoup
 
 from .base_xblock import BaseXblock
-from ..utils import jinja, download_and_convert_subtitles, prepare_url, get_back_jumps
+from ..utils import jinja, download_and_convert_subtitles, get_back_jumps
 from ..constants import getLogger
 
 
@@ -64,7 +64,7 @@ def prepare_download_view_data(self, instance_connection):
             and "url"
             in self.xblock_json["student_view_data"]["encoded_videos"]["fallback"]
         ):
-            self.url = urllib.parse.unquote(
+            self.url = requests.utils.unquote(
                 self.xblock_json["student_view_data"]["encoded_videos"]["fallback"][
                     "url"
                 ]
@@ -74,10 +74,10 @@ def prepare_download_view_data(self, instance_connection):
             and "url"
             in self.xblock_json["student_view_data"]["encoded_videos"]["mobile_low"]
         ):
-            self.url = urllib.parse.unquote(
-                self.xblock_json["student_view_data"]["encoded_videos"][
-                    "mobile_low"
-                ]["url"]
+            self.url = requests.utils.unquote(
+                self.xblock_json["student_view_data"]["encoded_videos"]["mobile_low"][
+                    "url"
+                ]
             )
         elif (
             "youtube" in self.xblock_json["student_view_data"]["encoded_videos"]
@@ -117,9 +117,7 @@ def prepare_download_view_data(self, instance_connection):
             else:
                 self.no_video = True
                 logger.error(
-                    "Cannot get video for {}".format(
-                        self.xblock_json["lms_web_url"]
-                    )
+                    "Cannot get video for {}".format(self.xblock_json["lms_web_url"])
                 )
                 logger.error(self.xblock_json)
                 self.add_failed({"url": self.xblock_json["lms_web_url"]})
@@ -139,7 +137,10 @@ def download_video(self, instance_connection):
                 if not success:
                     self.add_failed({"url": self.url})
             else:
-                prepared_url = prepare_url(urllib.parse.unquote(self.url), self.scraper.instance_url)
+                prepared_url = requests.utils.unquote(self.url)
+                prepared_url = requests.compat.urljoin(  # requests.utils has no urljoin
+                    self.scraper.instance_url, prepared_url
+                )
                 success = self.scraper.download_file(
                     prepared_url,
                     video_path,

From a307014a50dd88295c4947fd4015828b80b44c6c Mon Sep 17 00:00:00 2001
From: mucchu <harshwasnik160602@gmail.com>
Date: Tue, 26 Dec 2023 15:09:52 +0530
Subject: [PATCH 7/7] Updated annex.py to import individual items from
 collections to support the updated package

---
 openedx2zim/annex.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/openedx2zim/annex.py b/openedx2zim/annex.py
index 9b29a1b..a016351 100644
--- a/openedx2zim/annex.py
+++ b/openedx2zim/annex.py
@@ -3,7 +3,7 @@
 import uuid
 import json
 import pathlib
-import collections
+from collections import OrderedDict, defaultdict
 
 from bs4 import BeautifulSoup
 
@@ -17,7 +17,7 @@ class MoocForum:
     def __init__(self, scraper):
         self.scraper = scraper
         self.threads = []
-        self.categories = collections.OrderedDict()
+        self.categories = OrderedDict()
         self.staff_user = []
         self.output_path = self.scraper.build_dir.joinpath("forum")
         self.output_path.mkdir(parents=True, exist_ok=True)
@@ -219,7 +219,7 @@ def annex_forum(self):
             self.update_thread_children(thread)
 
     def render_forum(self):
-        thread_by_category = collections.defaultdict(list)
+        thread_by_category = defaultdict(list)
         for thread in self.threads:
             thread_by_category[thread["commentable_id"]].append(thread)
         jinja(