-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathweb_pull.py
72 lines (59 loc) · 1.96 KB
/
web_pull.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from GA_ondn import catch_data, file_to_string, clear_subcategories
import urllib.request
import re
import sys
import html.parser
def get_article_list(filename):
    ''' (str) -> article entries
    Reads the file at filename, parses its text with catch_data and
    returns the article entries after they have been passed through
    clear_subcategories.

    Callers in this file read element 0 of each entry as the path part
    of a url and element 1 as its number of pageviews.
    '''
    # catch_data yields three values; only the article entries are used.
    _heading, _period, entries = catch_data(file_to_string(filename))
    return clear_subcategories(entries)
def get_urls(filename):
    ''' (str) -> list of str
    Builds the full www.dnevnik.si url of every article entry obtained
    from get_article_list(filename) by prefixing the entry's first
    element with the site address.
    '''
    site = 'http://www.dnevnik.si'
    full_urls = []
    for entry in get_article_list(filename):
        full_urls.append(site + entry[0])
    return full_urls
def fetch_title(url):
    ''' (str) -> str
    Downloads the page at url, decodes it as UTF-8 and returns the text
    of its first <title> element with surrounding whitespace removed.

    Raises ValueError if the page has no <title> element. Errors raised
    by urllib.request.urlopen or by the UTF-8 decoding propagate
    unchanged.
    '''
    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(url) as response:
        page = response.read().decode('utf-8')
    # Non-greedy capture up to the closing tag: a greedy ".+" runs to the
    # last "<" on the line and drags trailing markup into the title.
    # DOTALL lets the title span several lines; IGNORECASE accepts <TITLE>.
    match = re.search(r'<title[^>]*>(.*?)</title>', page,
                      re.IGNORECASE | re.DOTALL)
    if match is None:
        raise ValueError('No <title> element found at ' + url)
    return match.group(1).strip()
def get_titles(articles):
    ''' (list) -> list of str
    Returns the webpage title of every article, in the same order as
    articles, with HTML character references (such as &amp;) decoded.

    articles is a sequence of entries whose first element is the path
    part of a www.dnevnik.si url; each page is downloaded with
    fetch_title, so any error it raises propagates.
    '''
    # html.unescape replaces HTMLParser().unescape, which was deprecated
    # in Python 3.4 and removed in 3.9. The name html is bound by the
    # file's "import html.parser".
    return [html.unescape(fetch_title('http://www.dnevnik.si' + entry[0]))
            for entry in articles]
def get_pageviews(li):
    ''' (sequence of entries) -> list
    Collects the second element (the pageviews) of every entry in li,
    keeping the original order.
    '''
    views = []
    for entry in li:
        views.append(entry[1])
    return views
if __name__ == "__main__":
    # Usage: python web_pull.py <input file>
    # Prints one (title, pageviews) tuple per article found in the file.
    if len(sys.argv) < 2:
        print('Please, provide the input file.')
        # Exit with a non-zero status so shells and calling scripts can
        # detect the failure; a bare sys.exit() would report success.
        sys.exit(1)
    filename = sys.argv[1]
    articles = get_article_list(filename)
    titles = get_titles(articles)
    pageviews = get_pageviews(articles)
    for title_and_views in zip(titles, pageviews):
        print(title_and_views)