scraper_bs4_MAIN.py
#PHist 1.1 web scraper
#Scrapes sites with BeautifulSoup4
#MAIN PROGRAM
import codecs, requests, json, datetime, time, os
import sys, getopt
import robotexclusionrulesparser
from urllib.parse import urljoin
import scraper_module as sm
import scraper_globals as sg
from bs4 import BeautifulSoup
from collections import OrderedDict
#___SETUP:___
#COMMAND LINE ARGUMENTS
'''
Give the seed file name as a command line argument:
    [-s] [seed file name]
The seed file must be plain text (.txt, UTF-8 encoded).
DO NOT USE Microsoft Word documents or similar formats, because
they usually contain metadata that breaks the parsing.
Give the language tag used in file names:
    [-l] [language tag]
For example ENG, FIN or RUS.
Give the level tag used in file names:
    [-L] [level tag]
For example SEED, L2 or L3.
A subdirectory with this name is created for the output files.
Give an optional prefix:
    [-p] [prefix name]
The prefix is used to track the files of each partial cycle.
A subdirectory with this name is created under the level directory.
See the example invocation right below this docstring.
'''
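#Example invocation (the seed file name and the tag values below are
#illustrative placeholders, not files or tags shipped with the project):
#    python scraper_bs4_MAIN.py -s seeds.txt -l FIN -L SEED -p batch01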
#Reads commandline arguments
fullCmdArguments = sys.argv
argumentList = fullCmdArguments[1:]
#Defines valid parameters
unixOptions = "hs:l:L:p:"
gnuOptions = ["help", "seedfile=", "language=", "level=", "prefix="]
#Parses the argument list
try:
    arguments, values = getopt.getopt(argumentList, unixOptions, gnuOptions)
except getopt.error as err:
    #Outputs the error and exits with an error code
    print(str(err))
    sys.exit(2)
#Goes through arguments and takes actions accordingly
for currentArgument, currentValue in arguments:
    if currentArgument in ("-h", "--help"):
        print('''Syntax:
[script name] [-s] [seed file name]
[script name] [-l] [language tag, e.g. FIN or RUS]
[script name] [-L] [scrape level tag: SEED, L2 or L3]
[script name] [-p] [optional prefix tag]''')
        sys.exit(0)
    elif currentArgument in ("-s", "--seedfile"):
        print('Seedfile to be used is (%s)' % currentValue)
        sg.seedfile = currentValue
    elif currentArgument in ("-l", "--language"):
        print('Language of scraped files is (%s)' % currentValue)
        sg.language = currentValue
    elif currentArgument in ("-L", "--level"):
        print('Scrape level is (%s)' % currentValue)
        sg.level = currentValue
    elif currentArgument in ("-p", "--prefix"):
        print('Prefix for output files is (%s)' % currentValue)
        sg.prefix = currentValue
#Creates time stamp for files
now = datetime.datetime.now()
t = now.strftime("%Y-%m-%d-%H%M%S")
#Make directories if needed
##Current working directory
cwd = os.getcwd()
print(cwd)
##Sets access rights for the directories
accessrights = 0o755 # 755 = readable by all, writeable by owner only
# 777 = readable and writeable by all
##Creates the required directories and defines where the files will be saved
resultpath = cwd + '/results'
resultpath2 = resultpath + '/' + sg.level
prefixpath = ''
savepath = resultpath2
if not os.path.exists(resultpath):
    try:
        os.mkdir(resultpath, accessrights)
    except OSError:
        print('Creation of the directory %s failed' % resultpath)
    else:
        print('Successfully created the path %s' % resultpath)
if not os.path.exists(resultpath2):
    try:
        os.mkdir(resultpath2, accessrights)
    except OSError:
        print('Creation of the directory %s failed' % resultpath2)
    else:
        print('Successfully created the path %s' % resultpath2)
if sg.prefix != '':    #if a prefix was given, creates a subdirectory for it and saves the files there
    prefixpath = resultpath2 + '/' + sg.prefix
    if not os.path.exists(prefixpath):
        try:
            os.mkdir(prefixpath, accessrights)
        except OSError:
            print('Creation of the directory %s failed' % prefixpath)
        else:
            print('Successfully created the path %s' % prefixpath)
    savepath = prefixpath
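#Note: the directory handling above could also be expressed with a single
#standard-library call that creates missing parent directories and tolerates
#existing ones; a minimal sketch (not used here):
#    os.makedirs(savepath, mode=accessrights, exist_ok=True)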
#Log file name
logf = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_LOG.txt'
print('The log file will be named ' + logf)
#Cycle for handling all links
def cycle(level, links_to_be_scraped, link_repository, log):
    print('Iterating ' + level + ' level links...', file = log)
    log.flush()
    iter_num = 0
    for link in links_to_be_scraped:
        print('\r\nURL: ' + str(link), file = log)
        log.flush()
        #Checks if link has already been processed before
        if sm.check_seen(link) == True:
            print(' Seen this link before - did not scrape.', file = log)
            log.flush()
            continue
        #Discards YouTube links
        if 'youtube.com' in link:
            print(' YouTube link found - did not scrape.', file = log)
            log.flush()
            continue
        #Discards mailto: links
        if 'mailto:' in link:
            print(' Mailto link found - did not scrape.', file = log)
            log.flush()
            continue
        #Collects Facebook links for future use but does not scrape them
        if 'facebook.com' in link:
            print(' Facebook link found - stored but did not scrape.', file = log)
            log.flush()
            sg.facebook_links.append(link)
            continue
        #Collects Twitter links for future use but does not scrape them
        if 'twitter.com' in link:
            print(' Twitter link found - stored but did not scrape.', file = log)
            log.flush()
            sg.twitter_links.append(link)
            continue
        #Collects VK links for future use but does not scrape them
        if 'vk.com' in link:
            print(' VK link found - stored but did not scrape.', file = log)
            log.flush()
            sg.vk_links.append(link)
            continue
        #Collects LinkedIn links for future use but does not scrape them
        if 'linkedin.com' in link:
            print(' Linkedin link found - stored but did not scrape.', file = log)
            log.flush()
            sg.linkedin_links.append(link)
            continue
        #Checks ROBOTS.TXT; if not allowed to scrape, moves to next link
        print(' Checking robots.txt...', file = log)
        log.flush()
        answer, err = sm.checkRobotstxt(link)
        if err == True:
            print(' Problem with reading ROBOTS.TXT - did not scrape.', file = log)
            log.flush()
            sg.robotstxt_not_found.append(link)
            continue
        elif err == False and answer == False:
            print(' The website does not allow the scraping of this link.', file = log)
            log.flush()
            sg.disallowed.append(link)
            sg.seen_already.add(link)
            continue
        else:
            print(' Scraping allowed!', file = log)
            log.flush()
        #Collects page contents; if no content found, moves to next link
        page_content = sm.soup(link, log)
        if page_content == 0:
            print(' Something went wrong or no page content found.', file = log)
            sg.not_found.append(link)
            sg.seen_already.add(link)
            continue
        elif page_content == 1:
            print(' Connection error / no page content found.', file = log)
            sg.not_found.append(link)
            sg.seen_already.add(link)
            continue
        else:
            print(' Got page contents!', file = log)
            log.flush()
            #Keeps track of file
            iter_num += 1
            file_num = sm.numFile(iter_num)
            full_filename = sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_' + str(file_num) + '.json'
            print('File number: ' + sg.prefix + '_' + file_num)
            print('File number: ' + sg.prefix + '_' + file_num, file = log)
            #Records data
            url = str(link)    #records current url
            a = sm.scr_date(log)    #records current date and time
            b = sm.pub_date(page_content, link, log)    #records text publish date
            c = sm.author(url, log)    #records author
            d = list(sm.links(page_content, link, log))    #records new found links as a list (modify sm.links to write collected links into a separate file 'links_out.txt')
            e = sm.text(page_content, log)    #records all text inside p-tags as one string
            f, num_of_comments = sm.collect_comments(page_content, link, log)    #records comments as a dictionary and the number of comments
            #Adds url to scraped list and seen list
            sg.scraped_links.append(url)
            sg.seen_already.add(url)
            #Adds collected links to the next level link repository
            for new_link in d:
                link_repository.add(new_link)
            #creates a dictionary of all the collected data except comments
            x = sm.makeDict(sg.language, sg.level, sg.prefix, file_num, url, a, b, c, d, e)
            #creates a JSON file of all collected data except comments
            sm.makeFile(full_filename, x, log, savepath)
            #If comments were found, creates a separate file for comments in the same directory
            if num_of_comments > 0:
                comments_filename = full_filename + '_comments.json'
                sm.makeFile(comments_filename, f, log, savepath)
            #Flushes the log file
            log.flush()
        #suspend execution for one second between links
        time.sleep(1)
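#NOTE: the actual robots.txt check is implemented in scraper_module as
#sm.checkRobotstxt(). The function below is only a minimal sketch of one way
#such a check could be written with the robotexclusionrulesparser library that
#is imported at the top of this file; it is an assumption about the approach,
#not the project's implementation, and nothing in this script calls it.
def robots_allowed_sketch(url, user_agent='*'):
    '''Returns (allowed, error), mirroring the (answer, err) pair expected from sm.checkRobotstxt().'''
    parser = robotexclusionrulesparser.RobotExclusionRulesParser()
    robots_url = urljoin(url, '/robots.txt')    #e.g. https://example.com/robots.txt
    try:
        parser.fetch(robots_url)                #downloads and parses robots.txt
    except Exception:
        return False, True                      #robots.txt could not be read
    return parser.is_allowed(user_agent, url), False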
#___MAIN_CODE___
print('Starting...')
#Starting time
start_time = time.time()
#Gets the seed links and cycles through them
log = open(logf, 'a', encoding='utf-8')
print('Opened log file...')
print('Getting seed links...', file = log)
log.flush()
seed_links = sm.get_seeds(sg.seedfile)
print('Got seed links!', file = log)
log.flush()
cycle(sg.level, seed_links, sg.links_L2, log)
log.close()
#Writes stats to files
outfile = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_outgoing_links_for_next_scrape_cycle.txt'
fb_file = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_facebook_links.txt'
tw_file = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_twitter_links.txt'
vk_file = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_vkontakte_links.txt'
li_file = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_linkedin_links.txt'
sc_file = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_scraped_links.txt'
nf_file = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_not_found_links.txt'
da_file = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_disallowed_links.txt'
rp_file = savepath + '/' + sg.language + '_' + sg.level + '_' + sg.prefix + '_' + t + '_robotstxt_not_found_links.txt'
with open(outfile, 'w', encoding='utf-8', errors='replace') as of:
    for i in sg.links_L2:
        of.write(i+'\r\n')
with open(fb_file, 'w', encoding='utf-8', errors='replace') as fbf:
    for i in sg.facebook_links:
        fbf.write(i+'\r\n')
with open(tw_file, 'w', encoding='utf-8', errors='replace') as twf:
    for i in sg.twitter_links:
        twf.write(i+'\r\n')
with open(vk_file, 'w', encoding='utf-8', errors='replace') as vkf:
    for i in sg.vk_links:
        vkf.write(i+'\r\n')
with open(li_file, 'w', encoding='utf-8', errors='replace') as lif:
    for i in sg.linkedin_links:
        lif.write(i+'\r\n')
with open(sc_file, 'w', encoding='utf-8', errors='replace') as scf:
    for i in sg.scraped_links:
        scf.write(i+'\r\n')
with open(nf_file, 'w', encoding='utf-8', errors='replace') as nff:
    for i in sg.not_found:
        nff.write(i+'\r\n')
with open(da_file, 'w', encoding='utf-8', errors='replace') as daf:
    for i in sg.disallowed:
        daf.write(i+'\r\n')
with open(rp_file, 'w', encoding='utf-8', errors='replace') as rpf:
    for i in sg.robotstxt_not_found:
        rpf.write(i+'\r\n')
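#The nine nearly identical with-blocks above could be collapsed with a small
#helper such as the sketch below. The helper is illustrative only; it is not
#part of the original program and nothing in this script calls it.
def write_link_list(path, links):
    '''Writes one link per line to path, like the blocks above.'''
    with open(path, 'w', encoding='utf-8', errors='replace') as out:
        for i in links:
            out.write(i + '\r\n')
#Example use (commented out so the files written above are not overwritten again):
#for path, links in [(outfile, sg.links_L2), (fb_file, sg.facebook_links)]:
#    write_link_list(path, links)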
with open(logf, 'a', encoding='utf-8') as log:
    #Prints out final stats
    print('*'*30, file = log)
    print('Scraped ' + str(len(sg.scraped_links)) + ' urls.', file = log)
    print('Could not scrape ' + str(len(sg.not_found)+len(sg.disallowed)) + ' urls.', file = log)
    print(str(len(sg.seen_already)) + ' unique links processed.', file = log)
    #Prints out elapsed time
    elapsed_time = time.time() - start_time
    showtime = str(datetime.timedelta(seconds=elapsed_time))
    print('Time it took to execute this program: ' + str(showtime), file = log)
print('\r\nDone!')