bs.py: Added Exception Handling and Persistence Support.

JMkrish · Oct 19, 2023 · c4cb382
1 parent a03f290
commit c4cb382
Showing 1 changed file with 55 additions and 36 deletions.
diff --git a/src/bs.py b/src/bs.py
@@ -9,42 +9,61 @@
 def scrape_root_url(start_url, dir_name):
     to_visit = []
     touched_urls = set()
-    to_visit.append(start_url)
-    touched_urls.add(start_url)
-    print(to_visit)
+    if os.path.exists('touched_urls.txt') and os.path.exists('to_visit.txt'):
+        # Read touched_urls
+        with open('touched_urls.txt', 'r') as f:
+            lines = []
+            for line in f:
+                touched_urls.add(line.strip())
+        # Read tovisit links
+        with open('to_visit.txt', 'r') as f:
+            lines = []
+            for line in f:
+                to_visit.append(line.strip())
+    else:
+        to_visit.append(start_url)
+        touched_urls.add(start_url)
 
-    while to_visit:
-        linkToVisit = to_visit.pop(0)
-        print(linkToVisit, '==== More to go : ' + str(len(to_visit)))
-        response = requests.get(linkToVisit)
-        source = BeautifulSoup(response.text, "html.parser")
-        divsFound = source.find_all("div", {"class": "d-lg-flex"})
-        if len(divsFound) < 2:
-            continue
-        div = divsFound[1]
-        linksFound = div.find_all('a')
-        for link in linksFound:
-            if link.has_attr("href"):
-                href = link["href"]
-                if href[0] == "#" and len(href) > 1 and '#' not in linkToVisit:
-                    new_link = linkToVisit + href
-                    if new_link not in touched_urls:
-                        to_visit.append(new_link)
-                        touched_urls.add(new_link)
-                if href[0] == "?" and len(href) > 1:
-                    new_link = linkToVisit + href
-                    if new_link not in touched_urls:
-                        to_visit.append(new_link)
-                        touched_urls.add(new_link)
-                elif href[0] == "/":
-                    new_link = start_url + href
-                    if new_link not in touched_urls:
-                        to_visit.append(new_link)
-                        touched_urls.add(new_link)
+    try:
+        while to_visit:
+            linkToVisit = to_visit.pop(0)
+            print(linkToVisit, '==== More to go : ' + str(len(to_visit)))
+            response = requests.get(linkToVisit)
+            source = BeautifulSoup(response.text, "html.parser")
+            divsFound = source.find_all("div", {"class": "d-lg-flex"})
+            if len(divsFound) < 2:
+                continue
+            div = divsFound[1]
+            linksFound = div.find_all('a')
+            for link in linksFound:
+                if link.has_attr("href"):
+                    href = link["href"]
+                    if href[0] == "#" and len(href) > 1 and '#' not in linkToVisit:
+                        new_link = linkToVisit + href
+                        if new_link not in touched_urls:
+                            to_visit.append(new_link)
+                            touched_urls.add(new_link)
+                    if href[0] == "?" and len(href) > 1:
+                        new_link = linkToVisit + href
+                        if new_link not in touched_urls:
+                            to_visit.append(new_link)
+                            touched_urls.add(new_link)
+                    elif href[0] == "/":
+                        new_link = start_url + href
+                        if new_link not in touched_urls:
+                            to_visit.append(new_link)
+                            touched_urls.add(new_link)
 
-        end_point = linkToVisit.removeprefix(start_url)
-        # with open('somefile.txt', 'a') as the_file:
-        #     the_file.write(linkToVisit + '\n')
+            end_point = linkToVisit.removeprefix(start_url)
+            with open('touched_urls.txt', 'a') as the_file:
+                the_file.write(linkToVisit + '\n')
 
-        pdf_name = os.path.join(dir_name, url_to_str(end_point) + ".pdf")
-        pdfkit.from_url(linkToVisit, pdf_name)
+            pdf_name = os.path.join(dir_name, url_to_str(end_point) + ".pdf")
+            pdfkit.from_url(linkToVisit, pdf_name)
+    except Exception:
+        print('Exception Occured')
+    finally:
+        # Touched urls are already saved.
+        # Now try to save to_visit urls.
+        with open('to_visit.txt','w') as file:
+	        file.write('\n'.join(to_visit))