-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWebsiteScrap.py
50 lines (39 loc) · 1.55 KB
/
WebsiteScrap.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import requests
from bs4 import BeautifulSoup
# Scrape https://www.learnpython.org/ and demonstrate basic BeautifulSoup
# operations: page title, anchor collection, class lookup, text extraction,
# and duplicate-link counting.
url = "https://www.learnpython.org/"
response = requests.get(url, timeout=10)  # timeout so the script can't hang forever
response.raise_for_status()  # fail fast on HTTP errors instead of parsing an error page
htmlContent = response.content
formatted_html_content = BeautifulSoup(htmlContent, 'html.parser')
# print(formatted_html_content)

# 1} Get the title of the HTML page
title = formatted_html_content.title
print(title)
# if you want only tag content — guard: a page may lack a <title> tag
if title is not None:
    print(title.string)

# 2} find All anchor tag on this website and print count
list_anchors = formatted_html_content.find_all('a')
# print all anchor tags
print(list_anchors)
# print count
print("Number of anchor tags on this website : ", len(list_anchors))

# 3} Get first element in the HTML page
print(formatted_html_content.find('head'))

# 4} Get classes of any element in the HTML page
# .get('class') returns None instead of raising KeyError when the tag has no class
first_anchor = formatted_html_content.find('a')
if first_anchor is not None:
    print(first_anchor.get('class'))

# 5} find all the elements by class name
print(formatted_html_content.find_all("a", class_="navbar-brand"))

# 6} Get the text from the tags/soup — guard: find() returns None when no <p> exists
first_paragraph = formatted_html_content.find("p")
if first_paragraph is not None:
    print(first_paragraph.get_text())

# 7} Get all the anchor tags from the page with iteration
# (reuse list_anchors from step 2 instead of re-querying the soup)
all_links = set()
for link in list_anchors:
    print(link)  # get all anchor tag with links
    href = link.get('href')
    print(href)  # get all links
    # skip anchors without an href so None doesn't pollute the unique-link set
    if href is not None:
        all_links.add(href)  # want to remove duplicate links
print(all_links)
print(len(all_links))

# 8} find duplicate links: total anchors minus unique hrefs
all_web_links_count = len(list_anchors)
after_remove_duplicate_links_count = len(all_links)
print('Number of duplicate links in this website are : ', all_web_links_count - after_remove_duplicate_links_count)