-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathhuffman_coding.ml
123 lines (110 loc) · 3.5 KB
/
huffman_coding.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
type t =
| Leaf of {
weight: int;
char: char;
}
| Node of {
weight: int;
left: t;
right: t;
}
let weight = function
| Leaf { weight = w; _ } -> w
| Node { weight = w; _ } -> w
module NodeMinHeap = Binary_heap.BinaryHeap (struct
type nonrec t = t
let compare a b = compare (weight a) (weight b)
(* It's pretty silly that Ord requires a `show` method, but useful
for debugging I guess *)
let rec show = function
| Leaf l ->
"[" ^ (string_of_int l.weight) ^ " " ^ (String.make 1 l.char) ^ "]"
| Node n ->
"[" ^ (string_of_int n.weight) ^ " " ^
"(" ^ (show n.left) ^ ", " ^ (show n.right) ^ ")]"
end)
(* TODO: frequency analysis based on a block size *)
let frequency str =
let tbl = Hashtbl.create 40 in
let char_list = Trie.explode str in
List.iter
(fun ch ->
if Hashtbl.mem tbl ch then
Hashtbl.replace tbl ch (1 + Hashtbl.find tbl ch)
else
Hashtbl.add tbl ch 1)
char_list;
(* Return the table *)
tbl
let code_tree str =
let extract_two heap =
let (a, heap_once) = NodeMinHeap.extract heap in
let (b, heap_twice) = NodeMinHeap.extract heap_once in
match (a, b) with
| (Some a_, Some b_) -> (a_, b_, heap_twice)
| _ -> raise (Invalid_argument "Cannot extract twice from this heap") in
(* Turn frequency analysis key/values into leaves and insert them into
a min-heap *)
let create_min_heap tbl =
Hashtbl.fold
(fun k -> fun v -> fun heap ->
Leaf { char = k;
weight = v;
}
|> NodeMinHeap.insert heap)
tbl
NodeMinHeap.empty_tree in
(* Reduce the tree by extracting the two smallest items, combining
them, and re-inserting into the min-heap *)
let rec reduce_min_heap heap =
if NodeMinHeap.height heap = 1 then heap
else
let (a, b, new_heap) = extract_two heap in
Node {
weight = weight a + weight b;
left = a;
right = b;
}
|> NodeMinHeap.insert new_heap
|> reduce_min_heap in
frequency str
|> create_min_heap
|> reduce_min_heap
|> NodeMinHeap.peek
|> function
| Some coding -> coding
| None -> raise (Invalid_argument "failed to code string")
let build_dictionary coding =
let rec inner node coding_thus_far = match node with
| Leaf { char = ch; _ } ->
(* reverse `coding_thus_far` because we append bits to the front
while recursing *)
[(ch, List.rev coding_thus_far)]
| Node { left = l; right = r; _ } ->
List.append
(inner l (0 :: coding_thus_far))
(inner r (1 :: coding_thus_far)) in
inner coding []
let encode str dictionary =
Trie.explode str |>
List.fold_left
(fun acc -> fun ch ->
List.assoc ch dictionary |>
List.append acc)
[]
let decode tree stream =
let rec inner node stream = match (node, stream) with
(* Check this case first so we don't accidentally forget our
character when our stream is empty! *)
| (Leaf { char = c; _ }, []) -> [c]
(* Now check base cases (empty stream, leaf) *)
| (_, []) -> []
| (Leaf { char = c; _ }, _) -> c :: inner tree stream
(* Recursive step, pop the stream and traverse our coding tree
accordingly *)
| (Node { left = l; right = r; _ }, hd :: tail) ->
if hd = 0 then inner l tail
else if hd = 1 then inner r tail
else raise (Invalid_argument
("unable to parse stream (bit: " ^ (string_of_int hd) ^ ")"))
in inner tree stream