retrieve_articles.py
# explicit imports for names used directly in this module
import re

from bs4 import BeautifulSoup

from lib.commons import *
from lib.profiling import *
from lib.regex import *
from lib.aliases import update_aliases
from lib.summarizer import *
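# NOTE (assumption): the wildcard imports above are expected to provide, among
# others, CONFIG, get_urls(), get_webpage(), write_file(), send_mail(),
# keep_delta(), parse_websites(), summarize(), profiling_records(),
# delete_old_files() and the HEADLINE_* regex patterns; this is inferred from
# how the names are used below, not verified against the lib modules.
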
def clean_webpages(websites):
    """Extract article URLs and headlines from the fetched webpages."""
    articles = []
    for listentry in websites:
        try:
            website_content = BeautifulSoup(listentry.content, 'html.parser')
            # prefer the <main> tag; fall back to <body> if it is missing
            if website_content.find('main'):
                website_content = website_content.find('main')
            else:
                website_content = website_content.find('body')
            # find all <a> tags
            atags = website_content.find_all('a')
            # clean each <a> tag
            for entry in atags:
                cleaned_result = entry.get('href')
                # skip anchors without an href attribute
                if not cleaned_result:
                    continue
                headline = ""
                # try the headline patterns in order of preference
                if re.findall(HEADLINE_HEADER, str(entry)):
                    headline = re.findall(HEADLINE_HEADER, str(entry))[0]
                elif re.findall(HEADLINE_H1, str(entry)):
                    hl = str(re.findall(HEADLINE_H1, str(entry)))
                    headline = hl.replace("[", "").replace("]", "")
                elif re.findall(HEADLINE_SPAN, str(entry)):
                    headline = re.findall(HEADLINE_SPAN, str(entry))[0]
                elif re.findall(HEADLINE_DIV, str(entry)):
                    headline = re.findall(HEADLINE_DIV, str(entry))[0]
                elif re.findall(HEADLINE_A, str(entry)):
                    headline = re.findall(HEADLINE_A, str(entry))[0]
                # if no pattern matched, derive a headline from the URL itself
                if headline == "":
                    if not cleaned_result.startswith('http'):
                        headline = cleaned_result.strip("/").replace('-', ' ').replace('.html', '').title()
                    else:
                        headline = cleaned_result.strip("/").split("/")[-1].replace('-', ' ').replace('.html', '').title()
                # drop entries whose URL contains a configured stopword
                if not any(stopword.upper() in cleaned_result.upper() for stopword in CONFIG['websiteconfig']['stopwords']):
                    # prepend the site prefix to relative URLs
                    # (site-specific: the '/fachbeitraege/' listing path is stripped from the base URL)
                    if cleaned_result.strip().startswith("/"):
                        cleaned_result = str(listentry.url).replace("/fachbeitraege/", '') + cleaned_result.strip()
                    # add the cleaned and filtered entry to the result if it is not already there
                    if cleaned_result not in str(articles):
                        articles.append(cleaned_result.strip() + "|" + str(headline))
        except Exception as e:
            print('Error cleaning webpage content for webpage', str(listentry.url), '- error message:', str(e))
    return articles
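
# Example (hypothetical values, not taken from config.json): an
# <a href="/news/foo-bar.html"> tag found on https://example.com would be
# emitted as the entry
#     "https://example.com/news/foo-bar.html|Foo Bar"
# assuming one of the HEADLINE_* patterns matched "Foo Bar". Every entry has
# the shape "<absolute url>|<headline>", which format_result() relies on.
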

def format_result(all_articles):
    result_str = ""
    # keep only articles whose URL starts with one of the configured webpages
    # (each page URL is matched without its last dot-suffix)
    try:
        articles = []
        for url in CONFIG['websiteconfig']['webpages']:
            page = url.rsplit(".", 1)[0]
            for x in all_articles:
                if x.startswith(page):
                    articles.append(x)
    except Exception as e:
        print('Error cleaning result', str(all_articles), '- error message:', str(e))
        exit()
    # concatenate the list entries into the result string
    for entry in articles:
        # special handling: a record is only added if either the key is not
        # contained, or the key AND the value are both contained
        for key, value in CONFIG['websiteconfig']['specialHandling'].items():
            # if the entry contains the key but does not start with the
            # respective value, skip it and process the next entry
            if key in entry and not entry.startswith(value):
                break
        else:
            # for-else: this branch runs only if the loop above finished
            # without a break, i.e. no special-handling rule rejected the entry
            headline, url = entry.split("|")[1], entry.split("|")[0]
            # normalize known URL quirks before adding the record
            url = (url.replace('security//news/', '/news/')
                      .replace('theregister.com/security//', 'theregister.com/')
                      .replace('/blog/blog/', '/blog/')
                      .replace('https://www.darkreading.com//', 'https://www.darkreading.com/'))
            result_str += keep_delta(headline + '\n' + url + '\n\n')
    return result_str
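
# Example (hypothetical config): with specialHandling = {"/tag/": "https://example.com"},
# an entry containing "/tag/" is kept only if it starts with
# "https://example.com"; any other "/tag/" entry hits the break and is dropped.
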

def main():
    print('\n')
    print('++++++++++++++++++++++++++++++++++ ARTICLE LINKS SCRIPT START ++++++++++++++++++++++++++++++++++')
    # get URLs from config.json
    print('Getting URLs from config')
    url_list = get_urls()
    # call the webpages
    print('Calling webpages as given in config.json')
    websites = get_webpage(url_list)
    # clean the webpage content and format the result
    print('Cleaning webpage contents')
    articles = clean_webpages(websites)
    result = format_result(articles)
    # write a file and/or send a mail, depending on config
    if CONFIG['resultfile']['createFile']:
        print('Writing result file')
        write_file(result)
    if CONFIG['mailconfig']['sendMail']:
        print('Sending mail')
        # fetch and send summaries if specified in config
        if CONFIG['mailconfig']['sendSummary']:
            webpage_contents = parse_websites(result.split("\n\n"))
            result_summary = summarize(webpage_contents)
            send_mail(result_summary, "html")
        else:
            send_mail(result, "")
    if CONFIG['profileRecords']:
        print('Updating profile records')
        profiling_records(result)
    # check whether the aliases should be updated
    if CONFIG['profiling']['profile2cortex']:
        alias_update = input("Want to update the cortex_aliases.config file (Y/n)? ")
        if alias_update.upper() in ["Y", "YES"]:
            update_aliases()
    # delete older files that are no longer needed
    delete_old_files()
    print('+++++++++++++++++++++++++++++++++++ SCRIPT END +++++++++++++++++++++++++++++++++++')


if __name__ == '__main__':
    main()