-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathngrams.py
76 lines (68 loc) · 2.38 KB
/
ngrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#import collections
import math, sys
#TODO: probability in one function!
def make_ngrams(plain_text, n):
ngram_array = []
array_text = list(plain_text)
for i in range(len(array_text) - n + 1):
ngram = []
for j in range(i,i+n):
ngram.append((array_text[j]))
ngram_array.append(tuple(ngram))
return ngram_array
def counter(ngram_array):
def item_get (array, key):
if key in array.keys ():
return array [key]
else:
return 0
if sys.version_info < (2, 7):
# compatibility
map = {}
if ngram_array is not None:
if hasattr (ngram_array, "iteritems"):
for elem, count in ngram_array.iteritems():
map[elem] = item_get(map,elem) + count
else:
for elem in ngram_array:
map[elem] = item_get(map,elem) + 1
return map
else:
import collections
return collections.Counter(ngram_array)
def count_ngrams(plain_text, n):
ngram_array = make_ngrams(plain_text, n)
ngrams = counter(ngram_array)
return ngrams
#count the probability of unigrams, return logaritmus of probability to avoid small numbers
#probability = count of found / total count
def probability(unigrams):
total = sum(unigrams.values())
unigram_probability = {}
for the_unigram in unigrams:
unigram_probability[the_unigram] = -math.log(unigrams[the_unigram]/total, 10)
return unigram_probability
#probability = count of certain bigram / total count bigrams with the same first letter
def probability_of_bigram(bigrams):
bigram_probability = {}
for the_bigram in bigrams:
sum_of_frequency = 0
for the_bigram2 in bigrams:
if (the_bigram[0] == the_bigram2[0]):
sum_of_frequency += bigrams[the_bigram2]
bigram_probability[the_bigram] = -math.log(bigrams[the_bigram]/sum_of_frequency, 10)
if bigram_probability[the_bigram] == -0.00:
del bigram_probability[the_bigram] #trash: throw away :)
return bigram_probability
#probability = count of certain trigram / total count of trigrams with the same first two letters
def probability_of_trigram(trigrams):
trigram_probability = {}
for the_trigram in trigrams:
sum_of_frequency = 0
for the_trigram2 in trigrams:
if (the_trigram[0] == the_trigram2[0]) & (the_trigram[1] == the_trigram2[1]):
sum_of_frequency += trigrams[the_trigram2]
trigram_probability[the_trigram] = -math.log(trigrams[the_trigram]/sum_of_frequency, 10)
if trigram_probability[the_trigram] == -0.00:
del trigram_probability[the_trigram]
return trigram_probability