-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhuffman.py
92 lines (76 loc) · 2.64 KB
/
huffman.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# Example Huffman coding implementation
# Distributions are represented as dictionaries of { 'symbol': probability }
# Codes are dictionaries too: { 'symbol': 'codeword' }
def huffman(p):
'''Return a Huffman code for an ensemble with distribution p.'''
#assert(sum(p.values()) == 1.0) # Ensure probabilities sum to 1
# Base case of only two symbols, assign 0 or 1 arbitrarily
if(len(p) == 2):
return dict(zip(p.keys(), ['0', '1']))
# Create a new distribution by merging lowest prob. pair
p_prime = p.copy()
a1, a2 = lowest_prob_pair(p)
p1, p2 = p_prime.pop(a1), p_prime.pop(a2)
p_prime[a1 + a2] = p1 + p2
# Recurse and construct code on new distribution
c = huffman(p_prime)
ca1a2 = c.pop(a1 + a2)
c[a1], c[a2] = ca1a2 + '0', ca1a2 + '1'
return c
def lowest_prob_pair(p):
'''Return pair of symbols from distribution p with lowest probabilities.'''
assert(len(p) >= 2) # Ensure there are at least 2 symbols in the dist.
sorted_p = sorted(p.items(), key=lambda (i,pi): pi)
return sorted_p[0][0], sorted_p[1][0]
def create_prob_dist(text):
prob_dist = {}
counts = {}
total = 0
for s in text + "!":
if s in counts.keys():
counts[s] = counts[s] + 1
total = total + 1
else:
counts[s] = 1
total = total + 1
for k in counts.keys():
prob_dist[k] = round(float(counts[k]) / float(total), 5)
summer = 0.0
summer = sum(prob_dist.values())
if summer != 1.0:
diff = 1.0 - summer
#print "Difference " + str(diff)
use_key = prob_dist.keys()[0]
#print "Key " + use_key
prob_dist[use_key] = round(prob_dist[use_key] + diff, 5)
return prob_dist
def hc_encode(text):
prob_dist = create_prob_dist(text)
#print prob_dist
huffman_tree = huffman(prob_dist)
#print huffman_tree
code_to = ""
for s in text:
code_to = code_to + huffman_tree[s]
code_to = code_to# + huffman_tree["!"]
return code_to
def demo_dickens():
txt = ""
with open('dickens') as f:
txt = f.read(200)
print "Text contains" + str(len(txt)) + " characters"
xx = hc_encode(txt)
print "Resulting code: " + str(xx)
#evaluate
baits = xx.__len__() / 8
print baits
# Example execution
#ex1 = { 'a': 0.5, 'b': 0.25, 'c': 0.25 }
#a = huffman(ex1) # => {'a': '0', 'c': '10', 'b': '11'}
#print a
#ex2 = { 'a': 0.25, 'b': 0.25, 'c': 0.2, 'd': 0.15, 'e': 0.15 }
#a = huffman(ex2) # => {'a': '01', 'c': '00', 'b': '10', 'e': '110', 'd': '111'}
#print a
#print hc_encode("abaaaabacbbb")
#print hc_encode("aeaeouuiiiuiui")
#demo_dickens()