Skip to content

Commit

Permalink
bs.py: Added Exception Handling and Persistence Support.
Browse files Browse the repository at this point in the history
  • Loading branch information
piedpiper36 committed Oct 19, 2023
1 parent a03f290 commit c4cb382
Showing 1 changed file with 55 additions and 36 deletions.
91 changes: 55 additions & 36 deletions src/bs.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,42 +9,61 @@
def scrape_root_url(start_url, dir_name):
    """Crawl pages reachable from ``start_url`` and save each one as a PDF.

    Pages are visited in FIFO order. On every page only the anchors inside
    the second ``<div class="d-lg-flex">`` are followed, and only when their
    ``href`` starts with ``#``, ``?`` (both appended to the current URL) or
    ``/`` (appended to ``start_url``). Each visited page is rendered with
    ``pdfkit`` into ``dir_name``; the file name is derived from the URL with
    ``start_url`` stripped, passed through ``url_to_str``.

    Progress is persisted in the current working directory so an interrupted
    run can be resumed by calling the function again:

    * ``touched_urls.txt`` -- one line per page whose PDF was written.
    * ``to_visit.txt`` -- the queue of pages still to process, rewritten
      whenever the function exits (normally or not).

    Args:
        start_url: Root URL of the crawl; also the prefix for ``/`` links.
        dir_name: Existing directory that receives the generated PDFs.

    Returns:
        None.
    """
    touched_file = 'touched_urls.txt'
    to_visit_file = 'to_visit.txt'

    to_visit = []
    touched_urls = set()
    if os.path.exists(touched_file) and os.path.exists(to_visit_file):
        # Resume a previous run. Blank lines are ignored so an empty or
        # hand-edited state file never yields an empty URL.
        with open(touched_file, 'r') as f:
            touched_urls.update(line.strip() for line in f if line.strip())
        with open(to_visit_file, 'r') as f:
            to_visit.extend(line.strip() for line in f if line.strip())
        # touched_urls.txt only lists finished pages; pending pages must be
        # marked as seen as well or they would be queued a second time as
        # soon as another page links to them.
        touched_urls.update(to_visit)
    else:
        to_visit.append(start_url)
        touched_urls.add(start_url)

    # URL currently being processed; None when no page is in flight.
    current = None
    try:
        while to_visit:
            current = to_visit.pop(0)
            print(current, '==== More to go : ' + str(len(to_visit)))
            response = requests.get(current)
            source = BeautifulSoup(response.text, "html.parser")
            divs_found = source.find_all("div", {"class": "d-lg-flex"})
            if len(divs_found) < 2:
                # Page lacks the expected second container: skip it without
                # producing a PDF or recording it.
                current = None
                continue

            for link in divs_found[1].find_all('a'):
                if not link.has_attr("href"):
                    continue
                href = link["href"]
                # startswith() instead of href[0]: an empty href="" must be
                # ignored, not raise IndexError and abort the crawl.
                if href.startswith("#") and len(href) > 1 and '#' not in current:
                    new_link = current + href
                elif href.startswith("?") and len(href) > 1:
                    new_link = current + href
                elif href.startswith("/"):
                    new_link = start_url + href
                else:
                    continue
                if new_link not in touched_urls:
                    to_visit.append(new_link)
                    touched_urls.add(new_link)

            end_point = current.removeprefix(start_url)
            pdf_name = os.path.join(dir_name, url_to_str(end_point) + ".pdf")
            pdfkit.from_url(current, pdf_name)

            # Record the page as done only once its PDF exists, so a failed
            # conversion is not mistaken for a finished page on resume.
            with open(touched_file, 'a') as the_file:
                the_file.write(current + '\n')
            current = None
    except Exception as exc:
        # Best effort: stop crawling but keep the state for a later resume,
        # and say what actually went wrong.
        print('Exception Occurred while processing ' + str(current) + ': ' + repr(exc))
    finally:
        # A page that was popped but not finished (error or Ctrl-C) goes back
        # on the queue; appended at the end so a persistently failing page
        # does not block the remaining ones on the next run.
        if current is not None:
            to_visit.append(current)
        with open(to_visit_file, 'w') as file:
            file.write('\n'.join(to_visit))

0 comments on commit c4cb382

Please sign in to comment.