-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhelper_functions.py
78 lines (54 loc) · 2.26 KB
/
helper_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
# Helper functions for transcript processing.
import operator
# Tally, per candidate, the words spoken in the given transcript lines.
def transcript_to_wordcount(transcripts, candidateList):
    """Count how often each candidate says each word.

    A line whose first space-delimited token ends with ':' opens a new
    speaker turn: the token minus the colon is the speaker name and the
    remainder of the line is that speaker's text. A line without such a
    label is attributed to the most recent speaker. Lines that appear
    before any speaker label are ignored.

    Before counting, the characters ``, . ! ?`` and newline are removed,
    the text is lower-cased and then split on single spaces. Empty or
    whitespace-only tokens (produced by repeated spaces or blank lines)
    are not counted.

    Args:
        transcripts: iterable of transcript lines (strings).
        candidateList: names of the speakers whose words are tallied.

    Returns:
        dict mapping every name in ``candidateList`` to a dict of
        ``{word: count}`` for the words attributed to that speaker.
    """
    strip_chars = ",.!?\n"
    wordDict = {candidate: {} for candidate in candidateList}

    # No speaker is known until the first "NAME:" label is seen.
    currentSpeaker = None

    for line in transcripts:
        # partition() never raises, even when the label is the whole line.
        firstWord, _, remainder = line.partition(' ')
        if firstWord.endswith(':'):
            # indicate who is now speaking
            currentSpeaker = firstWord[:-1]
            line = remainder

        # only tally text spoken by one of the requested candidates
        if currentSpeaker not in candidateList:
            continue

        # Character filter instead of str.translate(None, chars): that
        # two-argument form exists only on Python 2 and raises TypeError
        # on Python 3. This form behaves the same on both.
        cleaned = "".join(ch for ch in line if ch not in strip_chars).lower()

        counts = wordDict[currentSpeaker]
        for word in cleaned.split(' '):
            # "".isspace() is False, so empty tokens need their own check.
            if not word or word.isspace():
                continue
            counts[word] = counts.get(word, 0) + 1
    return wordDict
def sort_by_frequency(wordDict, candidateList):
    """Rank each candidate's words by count and build combined totals.

    Args:
        wordDict: dict mapping candidate name to a ``{word: count}`` dict.
        candidateList: candidate names to process; each must be a key of
            ``wordDict``.

    Returns:
        A 3-tuple ``(wordList, totalDict, totalWords)`` where
        ``wordList`` maps each candidate to a list of ``(word, count)``
        tuples sorted by ascending count, ``totalDict`` maps each word to
        its count summed over the processed candidates, and
        ``totalWords`` is the sum of all counts.
    """
    count_of = operator.itemgetter(1)
    ranked = {}
    combined = {}
    grand_total = 0

    for name in candidateList:
        # turn the word dictionary into (word, count) tuples and sort them
        pairs = list(wordDict[name].items())
        pairs.sort(key=count_of)
        ranked[name] = pairs

        # fold this candidate's counts into the overall tallies
        for word, count in pairs:
            grand_total += count
            combined[word] = combined.get(word, 0) + count

    return ranked, combined, grand_total
def transcript_to_candidate_lines(transcripts, candidateList):
    """Collect the lines spoken by each candidate.

    A line whose first space-delimited token ends with ':' opens a new
    speaker turn: the token minus the colon is the speaker name and the
    remainder of the line is that speaker's text. A line without such a
    label is attributed to the most recent speaker. Lines that appear
    before any speaker label are ignored.

    Args:
        transcripts: iterable of transcript lines (strings).
        candidateList: names of the speakers whose lines are collected.

    Returns:
        dict mapping every name in ``candidateList`` to the list of that
        speaker's lines in transcript order, with the speaker label and
        all newline characters removed. A label with no following text
        contributes an empty string.
    """
    sentenceList = {candidate: [] for candidate in candidateList}

    # No speaker is known until the first "NAME:" label is seen.
    currentSpeaker = None

    for line in transcripts:
        # partition() never raises, even when the label is the whole line.
        firstWord, _, remainder = line.partition(' ')
        if firstWord.endswith(':'):
            currentSpeaker = firstWord[:-1]
            line = remainder

        if currentSpeaker in candidateList:
            # str.translate(None, "\n") is the Python 2-only signature and
            # raises TypeError on Python 3; replace() works on both.
            sentenceList[currentSpeaker].append(line.replace("\n", ""))
    return sentenceList