-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctionDefs.py
203 lines (173 loc) · 7.52 KB
/
functionDefs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
## Function definitions for the physics RSS feed reader.
from keywords import *
import re
import feedparser
#from subprocess import call
import smtplib # Import smtplib for the actual sending function
import time
import datetime as dt
from email.mime.text import MIMEText # Import the email modules we'll need
from unicodedata import normalize
def makeUnicode(input):
    """Return `input` as text, decoding byte strings as UTF-8.

    Byte strings (Python 2 `str`, `bytes`, `bytearray`) are decoded as UTF-8
    with undecodable bytes silently dropped. Anything else -- including
    values that are already text -- is returned unchanged.

    The parameter keeps its original name `input` (which shadows the builtin)
    so that keyword callers continue to work.
    """
    # isinstance() instead of an exact type() comparison: subclasses are
    # handled correctly and non-string values no longer raise AttributeError
    # from a missing .decode() method.
    if isinstance(input, (bytes, bytearray)):
        return input.decode('utf-8', 'ignore')
    return input
def myFormat(text):
    """Normalize `text` for keyword matching.

    The text is lower-cased and every character that is neither a word
    character nor whitespace (i.e. punctuation) is removed.
    """
    lowered = text.lower()
    withoutPunctuation = re.sub(r'[^\w\s]', '', lowered)
    return withoutPunctuation
# Precompiled, non-greedy pattern matching anything of the form <...>;
# passes_filter uses it to strip HTML tags out of author strings.
cleanr = re.compile('<.*?>')
def myAuthorFormat(text):
    """Reduce an author name to the canonical form '<first initial> <last name>'.

    The name is lower-cased, periods are treated as separators (initials are
    sometimes listed without spaces, e.g. 'J.R.R. Tolkien'), and remaining
    punctuation is removed. The result is the first character of the first
    name token, a space, and the last name token, e.g. 'j tolkien'.

    Returns '' when `text` contains no name tokens at all (empty string,
    whitespace only, or punctuation only).
    """
    if text in ['', u'', u' ']:
        return ''
    # Periods become spaces so that unspaced initials split into tokens.
    text = text.replace('.', ' ')
    formatted = re.sub(r'[^\w\s]', '', text.lower())
    # split() with no argument splits on any run of whitespace (spaces, tabs,
    # newlines) and never yields empty tokens, so no separate "collapse
    # double spaces" step or strip() is needed.
    nameTokens = formatted.split()
    if not nameTokens:
        return ''
    return nameTokens[0][0] + ' ' + nameTokens[-1]
def passes_filter(entry,latestExecutionDate, getVeryInterestingEntries):
    """Decide whether a feed entry should be included in the digest.

    An entry is "very interesting" when one of its authors matches a name in
    `authors`; otherwise it is "interesting" when its summary contains an
    entry of `wordlist` or its title contains an entry of `titlelist`. The
    two categories are mutually exclusive so an entry is never listed twice.
    (`authors`, `wordlist` and `titlelist` are not defined in this file --
    presumably they come from `from keywords import *`.)

    Parameters:
        entry: a parsed feed entry exposing `title`, `summary` and,
            optionally, `authors` / `author` / `updated_parsed`.
        latestExecutionDate: integer of the form YYYYMMDD; dated entries
            older than this are rejected.
        getVeryInterestingEntries: if true, report on the "very interesting"
            category, otherwise on the "interesting" category.

    Returns:
        True if the entry is in the requested category and is new.
    """
    # --- Authors -----------------------------------------------------------
    # Feeds expose author information inconsistently: a list of dicts in
    # `authors`, a plain string in `author`, or nothing at all.
    try:
        entryAuthors = entry.authors
        if {} in entryAuthors:
            # `authors` exists but carries no usable info: use the fallback.
            raise AttributeError('')
    except AttributeError:
        try:
            entryAuthors = entry.author
        except AttributeError:
            entryAuthors = [""]
    if isinstance(entryAuthors, (str, unicode)):
        entryAuthors = [entryAuthors]

    formattedEntryAuthors = set()
    for entryAuthor in entryAuthors:
        try:
            entryAuthor = entryAuthor['name']
        except (TypeError, KeyError):
            # Already a plain string, or a mapping without a 'name' key.
            pass
        if not isinstance(entryAuthor, (str, unicode)):
            # Nothing textual to match against; skip instead of crashing
            # inside re.sub below.
            continue
        entryAuthor = re.sub(cleanr, '', entryAuthor)  # strip html tags
        # A single string may hold a comma-separated list of authors.
        for namePiece in entryAuthor.split(','):
            formattedEntryAuthors.add(myAuthorFormat(namePiece))

    isVeryInteresting = any(
        myAuthorFormat(author) in formattedEntryAuthors for author in authors)

    # --- Keywords ----------------------------------------------------------
    if isVeryInteresting:
        # Avoids duplicate listings in the summary produced.
        isInteresting = False
    else:
        formattedSummary = myFormat(entry.summary)
        formattedTitle = myFormat(entry.title)
        isInteresting = (
            any(myFormat(word) in formattedSummary for word in wordlist)
            or any(myFormat(titleword) in formattedTitle
                   for titleword in titlelist))

    # --- Date --------------------------------------------------------------
    # Only the arXiv feed is known not to give updated_parsed values. A value
    # of None (attribute present but date missing) is treated the same way.
    parsedDate = getattr(entry, 'updated_parsed', None)
    if parsedDate is None:
        # The arXiv RSS is refreshed daily, so undated entries are assumed
        # new, except on weekends and except for entries whose title ends in
        # 'UPDATED' followed by one closing character.
        isWeekday = dt.datetime.today().weekday() < 5
        isJustAnUpdate = entry.title[-8:-1] == "UPDATED"
        isNew = isWeekday and not isJustAnUpdate
    else:
        year, month, day = parsedDate[0:3]
        # Same YYYYMMDD integer as concatenating zero-padded strings.
        pubDate = year * 10000 + month * 100 + day
        isNew = pubDate >= latestExecutionDate

    if getVeryInterestingEntries:
        return(isVeryInteresting and isNew)
    return(isInteresting and isNew)
def strip_html(text):
    """Return `text` with every '<...>' tag removed."""
    tagPattern = '<[^<]+?>'
    withoutTags = re.sub(tagPattern, '', text)
    return withoutTags
def unicodeToAscii(uStr):
    """Best-effort conversion of `uStr` to an ASCII representation.

    First choice: NFKD-normalize the text and encode it as ASCII, dropping
    whatever cannot be represented. If that fails for any reason, fall back
    to str(uStr); if that fails as well, give up and return `uStr` unchanged.
    """
    try:
        decomposed = normalize('NFKD', uStr)
        return decomposed.encode('ascii', 'ignore')
    except Exception:
        # Normalization/encoding not possible; try the next option.
        pass
    try:
        return str(uStr)
    except Exception:
        # Giving up on the conversion.
        return uStr
def _arxivIdFromTitle(title):
    """Return the identifier following '(arXiv:' in `title`, or None if absent."""
    match = re.search(r'\(arXiv:([^\s)]+)', title)
    if match is None:
        return None
    return match.group(1)


def executiveSummary(feedURL,feedNameString,latestExecutionDate,listOfArxivIDsSoFar,getVeryInterestingEntries):
    """Fetch one feed and build the plain-text digest section for it.

    Parameters:
        feedURL: location handed to feedparser.parse.
        feedNameString: display name of the feed; if it contains 'arxiv'
            (case-insensitive) duplicate arXiv entries are removed.
        latestExecutionDate, getVeryInterestingEntries: forwarded to
            passes_filter for every entry.
        listOfArxivIDsSoFar: list of arXiv identifiers already reported.
            It is extended in place with newly seen identifiers and is also
            returned.

    Returns:
        A tuple (summaryText, numberOfEntries, listOfArxivIDsSoFar).
        summaryText is '' and numberOfEntries is 0 when nothing passes.
    """
    # Parenthesized single-argument form prints identically on Python 2 and 3.
    print("Now checking: "+feedNameString)
    feed = feedparser.parse(feedURL)
    filtered_entries = [entry for entry in feed.entries
                        if passes_filter(entry, latestExecutionDate, getVeryInterestingEntries)]
    if not filtered_entries:
        return('',0,listOfArxivIDsSoFar)

    if 'ARXIV' in feedNameString.upper():
        ## Remove duplicate entries (eg, entries listed in both Q. Phys. and Cond.Mat.):
        uniqueEntries = []
        for entry in filtered_entries:
            thisArxivID = _arxivIdFromTitle(strip_html(entry.title))
            if thisArxivID is None:
                # No '(arXiv:...' marker in the title: there is no identifier
                # to compare, so keep the entry rather than deduplicating on
                # an arbitrary slice of the title.
                uniqueEntries.append(entry)
            elif thisArxivID not in listOfArxivIDsSoFar:
                listOfArxivIDsSoFar.append(thisArxivID)
                uniqueEntries.append(entry)
        filtered_entries = uniqueEntries

    numFilteredEntries = len(filtered_entries)
    if numFilteredEntries == 0:
        return('',0,listOfArxivIDsSoFar)

    ## Assemble the large summary string:
    pieces = ["--------------- "+feedNameString+" ---------------", "\n"]
    for entry in filtered_entries:
        try:
            pieces.append("\n" + strip_html(entry.title) + "\n \n \t" + strip_html(entry.author) + "\n \t" + entry.link + "\n \n" + strip_html(entry.description) + "\n")
        except AttributeError:
            # Most likely the entry has no author: retry without that line.
            try:
                pieces.append("\n" + strip_html(entry.title) + "\n \n \t" + entry.link + "\n \n" + strip_html(entry.description) + "\n")
            except AttributeError:
                # Deliberately best-effort: an entry lacking title, link or
                # description is left out of the text (it is still counted).
                continue
    largestring = "".join(pieces)
    return(unicodeToAscii(largestring),numFilteredEntries,listOfArxivIDsSoFar)
def sendSummaryEmails(recipientList,message,numVIentries,numOtherEntries):
    """Email the digest text to every recipient, one message per recipient.

    Parameters:
        recipientList: iterable of recipient email addresses.
        message: digest body text.
        numVIentries, numOtherEntries: entry counts shown in the subject
            line as '[<numVIentries>,<numOtherEntries>]'.

    Raises:
        Whatever smtplib raises while connecting, logging in or sending.
        The connection is always released, even when an error occurs.
    """
    # NOTE(review): the account details are hard-coded here; consider loading
    # them from configuration or the environment instead of source control.
    sender = '[email protected]'
    username = '[email protected]' # for gmail include "@gmail.com"
    powned = '******' # the password for your account
    # The subject is identical for every recipient, so build it once.
    subject = "cQED Digest "+str(time.strftime("%m/%d/%Y"))+" ["+str(numVIentries)+","+str(numOtherEntries)+"]"
    s = smtplib.SMTP('example.com:587')
    try:
        s.ehlo()
        s.starttls()
        s.login(username,powned)
        # A separate message per recipient, so that each 'To' header names
        # only that recipient.
        for recipient in recipientList:
            msg = MIMEText(message.encode('utf-8'))
            msg['Subject'] = subject
            msg['From'] = sender
            msg['To'] = recipient
            s.sendmail(sender, [recipient], msg.as_string())
    finally:
        # Always release the connection, including when login/send failed.
        try:
            s.quit()
        except smtplib.SMTPServerDisconnected:
            # The server is already gone; just free the local socket.
            s.close()