First Crawler Self Built.py
# -*- coding: utf-8 -*-
"""
Created on Wed Aug 12 13:48:53 2020
@author: PC
"""
import csv
import re
from urllib.parse import urljoin, urlparse

import requests

from Links import L_1  # local module supplying the seed URL
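
# `Links` is a local module that is not included in this file view. As a
# rough sketch (an assumption, not the author's actual file), a minimal
# Links.py satisfying the import above could be just:
#
#     L_1 = "https://example.com"  # hypothetical placeholder seed URL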

main_lk = L_1
print(main_lk)
bucket = []  # reserved for the commented-out get_info method below

class Crawler:
    def __init__(self, inilink, stop):
        self.inilink = inilink   # seed URL the crawl starts from
        self.checkpoint = set()  # URLs already recorded
        self.stop = stop         # maximum number of links to record
        self.count = 0

    def html(self, url):
        # Fetch a page and return its HTML, or "" on any request error.
        try:
            html = requests.get(url, timeout=10)
        except Exception as e:
            print(e)
            return ""
        return html.text

    def get_link(self, url):
        # Extract href targets and resolve relative links against the
        # page URL so every returned link is absolute.
        html = self.html(url)
        links = list(set(re.findall(r'href="([-+./:\w]+)"', html)))
        for x in range(len(links)):
            parse = urlparse(links[x])
            if not parse.netloc:
                links[x] = urljoin(url, links[x])
        return links
    # def get_info(self, url):
    #     html = self.html(url)
    #     bucket.append()

    def save_csv(self, url):
        # Append the URL as a single-column row in links.csv.
        print(url)
        with open('links.csv', 'a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([url])
    def crawl(self, url):
        # Depth-first crawl: record each unseen link, then recurse into it.
        # Note: on deep sites this can hit Python's recursion limit.
        for link in self.get_link(url):
            if link in self.checkpoint:
                continue
            self.count += 1  # count only links not seen before
            if self.count < self.stop:
                self.save_csv(link)
                self.checkpoint.add(link)
                self.crawl(link)

    def start(self):
        self.crawl(self.inilink)


if __name__ == "__main__":
    crawler = Crawler(main_lk, 10000)
    crawler.start()
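
# Usage sketch (assuming a Links.py like the hypothetical one above):
#
#     python "First Crawler Self Built.py"
#
# This starts a depth-first crawl at L_1 and appends each newly discovered
# link as a row in links.csv, stopping once about 10000 links are recorded.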