-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathnorvig.rkt
executable file
·142 lines (117 loc) · 4.43 KB
/
norvig.rkt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env racket
#lang racket/base
;;; Norvig's spelling corrector
;; -----------------------------------
;; Authors:
;; Jyotirmoy Bhattacharya ([email protected])
;; Matthias Felleisen
(provide train correct edits1)
(require racket/port)
(require racket/list)
(require racket/cmdline)
(define ALPHABET (map string (string->list "abcdefghijklmnopqrstuvwxyz")))
;; type HFrequency = [Hashof String Nat]
;; type LFrequency = [Listof [Cons String Nat]]
;; String[Filename] -> Void
;; looks at each word on standard-in and prints it together with a correction
(define (main training-file)
(define m (train training-file))
(for ([l (in-lines)]
#:when (not (string=? l ""))
[w (in-value (string-downcase l))])
(printf "~a, ~a\n" w (correct m w))))
;; String[Filename] -> HFrequency
;; Return a hash table representing the frequency of words in the file given as an argument
;; effect: open fname for input and leave it open
(define (train fname)
(freqs (words (port->string (open-input-file fname)))))
;; String -> [Listof String]
;; Extracts words from a string and convert them to lowercase
(define (words buf)
(regexp-match* #rx"[a-z]+" (string-downcase buf)))
;; [Listof String] -> HFrequency
;; Take a list of words, return a hash table with words as keys and frequencies as values
(define (freqs xs)
(define m (make-hash))
(for ([x (in-list xs)])
(hash-update! m x add1 0))
m)
;; HFrequency String -> String
;; Returns the correction for a word.
;; Returns the word itself if no correction is found.
(define (correct m s)
(define (best-known xs) (best (known m xs)))
(or (best-known (list s))
(best-known (edits1 s))
(best-known (edits2 m s))
s))
;; HFrequency [Listof String] -> LFrequency
;; Given a hash map and a list of words, returns a list of (word,frequency) pairs for each word
;; that is in the hash map.
(define (known m xs)
(for*/list ([x (in-list xs)] [v (in-value (hash-ref m x #f))] #:when v)
(cons x v)))
;; LFrequency -> String or False
;; Given a list of (word,frequency) pairs, returns one of the words with the highest frequency.
;; Given an empty list returns #f
(module+ test
(require rackunit)
(check-equal? (best '(("a" . 2) ("b" . 3) ("c" . 0))) "b")
(check-true (cons? (member (best '(("a" . 2) ("b" . 3) ("c" . 3))) '("c" "b"))))
(check-false (best '())))
(define (best xs)
(if (empty? xs) #f (car (argmax cdr xs))))
;; String -> [Listof Sting]
;; create all words of edit distance 1
(define (edits1 s)
(define ss (splits s))
(append (deletes ss) (inserts ss) (replaces ss) (transposes ss)))
;; HFrequency String -> [Listof Sting]
;; create all words of edit distance 1
(define (edits2 m s)
(for*/list ([x (in-list (edits1 s))][y (in-list (edits1 x))]#:when (hash-ref m y #f))
y))
;; type Splits = [Listof [Cons String String]]
;; String -> Splits
;; The different ways to split a string
(define (splits s)
(for/list ([n (in-range (add1 (string-length s)))])
(cons (substring s 0 n) (substring s n))))
;; Splits -> [Listof String]
;; One character editing functions. Take a split from splits and return a list of words
(module+ test
(check-equal? (deletes '(("" . "abc") ("a" . "bc") ("ab" . "c") ("abc" . ""))) '("bc" "ac" "ab")))
(define (deletes ss)
(for*/list ([s (in-list ss)] [rht (in-value (cdr s))] #:when (not (string=? rht "")))
(string-append (car s) (substring rht 1))))
;; Splits -> [Listof String]
(define (inserts ss)
(for*/list ([s (in-list ss)] [c (in-list ALPHABET)])
(string-append (car s) c (cdr s))))
;; Splits -> [Listof String]
(define (replaces ss)
(for*/list ([s (in-list ss)]
[rht (in-value (cdr s))]
#:when (not (string=? rht ""))
[rht-tail (in-value (substring rht 1))]
[c (in-list ALPHABET)])
(string-append (car s) c rht-tail)))
;; Splits -> [Listof String]
(define (transposes ss)
(for*/list ([s (in-list ss)] [r (in-value (cdr s))] #:when (>= (string-length r) 2))
(string-append (car s) (string (string-ref r 1)) (string (string-ref r 0)) (substring r 2))))
(module+ main
;; The main program.
;; Must be called as
;; norvig.rkt [training file]
;; Learns word frequencies from the [training file],
;; then reads one word per line from standard
;; input and print lines of the form
;; word, correction
;; to standard output.
(define training-file
(command-line
#:program "norvig"
#:args (filename)
filename))
(main training-file))