-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanagrams.py
317 lines (249 loc) · 12 KB
/
anagrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
#!/usr/bin/python
#Copyright 2011, 2012 Mark Santesson
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
Anagram generator. Use prime numbers to represent the phrase given.
Each letter has a unique prime number. A phrase's signature is
the product of all the prime numbers assigned to the component
letters. The signatures for any set of phrases which are anagrams
of each other will all be the same number.
By Mark Santesson.
'''
import time, os, sys, collections
timings = []
last_time = time.clock()
def elapsedTime():
'''Return the number of seconds that has passed since the last time this
function was called. The first time it is called it will return the time
since the function was defined.'''
global last_time
this_time = time.clock()
elap_time = this_time - last_time
last_time = this_time
return elap_time
def addTiming(desc):
timings.append( ( elapsedTime(), desc, ) )
def nextprime(x):
"""Quick prime number calculator. Pass in a number and it will return
the smallest prime strictly larger than the input. It is very
inefficient."""
# Create a function which returns the next prime larger than the
# passed value. A faster way to do this is to keep track of all
# primes seen so far and then start trying primes incrementally
# higher than that, dividing only by the other primes already
# found. You can also stop at sqrt(x). I'm only going to generate
# 26 of them, so this easy method is fine.
while True: # On the theory that there are an infinite number of primes...
x += 1
for pot_fac in range(2,x): # Not even using the sqrt(x) or even numbers optimizations!
if x % pot_fac == 0:
break
else:
return x
def prime_generator():
'''Returns a generator that produces the next largest prime as
compared to the one returned from this function the last time
it was called. The first time it is called it will return 2.'''
lastprime = 1
while True:
lastprime = nextprime(lastprime)
yield lastprime
class AnagramGenerator:
'''The AnagramGenerator class produces all anagram combinations
that are possible from the input phrase using the given dictionary.
If no dictionary is given to the constructor (as a filename)
then it assumes "words" is the filename.'''
def __init__(self, dictionary_filename='words'):
'''Make a AnagramGenerator object. There is no way to
switch dictionaries; create a new instance instead.'''
self.dictionary_filename = dictionary_filename
self.__loadDictionary()
def __getPrime(self, c):
'''Returns the prime number that represents the passed
in character. The character must be lower case.'''
# assert c==c.lower()
return self.chars_list[ ord(c) - ord('a') ]
def __getSignature(self, s):
'''Returns the integer that represents the characters in
a string. The integer is the product of the prime number
representation of each character in the string.'''
s = s.lower()
sig = reduce(lambda x,y: x * self.__getPrime(y),
[ch for ch in s if ord('a') <= ord(ch) <= ord('z')], 1)
assert sig >= 1, repr( ('Product of primes produced non-positive number',s, sig, [ (z,self.__getPrime(z)) for z in s if ord('a') <= ord(z) <= ord('z') ]) )
return sig
def __loadDictionary(self):
'''Load a dictionary from the given input file. The file
should have one word per line.'''
addTiming('Start of loading dictionary from "%s"' % (self.dictionary_filename,))
# Count chars in the dictionary. Create a dict of lower
# case character to the number of instances found in the
# words dictionary. It might be useful to break this
# function into two parts to enable custom dictionaries
# passed in as a list/string instead of just specifying
# a dictionary filename. However, since I'm testing
# the performance of this script, I need to combine the
# counting and the line loading to prevent a second
# loop. There are ways around that, but it isn't a
# focus for now. If loading from something other than
# a dictionary file is needed then this is the place
# to make that change.
all_lines = list()
counts = dict([ (chr(x),0) for x in range(ord('a'), ord('z')+1) ])
for line in open( self.dictionary_filename ).readlines():
all_lines.append(line.strip())
for c in line.lower():
if ord('a') <= ord(c) <= ord('z'):
counts[ c ] += 1
# Construct a dict mapping from lower case character to the
# prime number for it. Initialize it in the order of most
# used character first, so that the letters most frequently
# used get lower numbers in the hopes that the products
# will be smaller numbers.
primes = prime_generator() # get the prime number generator
chars_map = dict( [ (x,primes.next()) for x,y in \
sorted( counts.items(), key=lambda i:i[1],
reverse=True ) ] )
# Recast the dict as a list, where the index is the index
# in the alphabet. The value is the prime number for that
# character.
self.chars_list = [ chars_map[chr(x)] for x in range(ord('a'), ord('z') + 1) ]
addTiming( 'Assign Primes to the alphabet.' )
# Insert all the dictionary words into the list
self.sigs = collections.defaultdict( list )
for word in all_lines:
sig = self.__getSignature( word )
if sig > 1: # ignore lines without letters
self.sigs[ sig ].append( word )
# We need the keys in list form so we can have it sorted.
# Lookups into the signatures dictionary are pretty rare.
self.sigs_keys = sorted( self.sigs.keys() )
addTiming( 'Create signatures dictionary.' )
def __getSignatureKeys(self):
'''Returns the sorted list of all signature keys. This is
used to populate a list of all available words.'''
return self.sigs_keys
def formulateAnagramPhraseCombo(self, res):
'''Render an anagram "phrase" into text for display. Each key
is separate by double spaces, and each key displays as a comma
separated list of the words choices.'''
return " ".join( [ ','.join(self.sigs[s]) for s in res ] )
def __solveAnagramPhrase(self, letters, unreduced_keys, start=0, \
so_far=[], display_progress=False):
'''This is the recursive function that helps produce anagrams.
It should not be called directly.'''
# letters: Product of signatures of letters remaining.
# unreduced_keys: Keys representing items in the dictionary
# that can non necessarily be reached by the remaining
# letters.
# start: The index of unreduced_keys to start considering.
# keys before this do not need to be considered in
# combination with this pick because earlier keys
# will be considered separately and they will consider
# combining with this pick as part of their recursion.
# The start value should be considered again in case
# the same key can be used multiple times.
# so_far: A list containing the keys that have been picked
# so far in this chain of recursions.
# display_progress: Flag, if true then results are displayed
# as they are discovered.
if letters == 1:
# There are no letters left.
if display_progress:
print "Result: %s - %s" % \
( repr(so_far),
self.formulateAnagramPhraseCombo(so_far) )
return [ so_far ]
# Filter list of keys to remove any that can no longer
# be constructed using some of the input letters.
reduced_keys = [ x for x in unreduced_keys[start:] if letters % x == 0 ]
result = []
# Recurse on all items remaining in the dictionary.
for index,sig in enumerate(reduced_keys):
remaining_letters = letters // sig
result += self.__solveAnagramPhrase(
letters = remaining_letters,
unreduced_keys = reduced_keys,
start = index,
so_far = so_far + [ sig ],
display_progress= display_progress
)
return result
def anagrams( self, phrase_string, display_progress=False ):
'''This function takes an input phrase string and returns
a list of all anagrams that can be generated from it. The
return value is a list of lists of lists. The inner lists
represent the individual words which are interchangeable
in an anagram. For instance, "tarp" and "trap" are
anagrams of each other. Whenever one can be used, the
other can substitute. They will be presented together
in the innermost list. An example output for "lumberjack"
might be:
[ [ ['me'], ['bark'], ['Cluj'] ],
[ ['me'], ['blur','burl'], ['jack'], ],
[ ['be'], ['mark'], ['Cluj'] ]
[ ['am'], ['jerk'], ['club'] ]
[ ['elm','Mel'], ['jar'], ['buck'] ]
...
]
'''
all_keys = self.__getSignatureKeys()
if options.dumpdict:
for k,v in sorted(sigs.items(), key=lambda x: x[0]):
print repr(k),'->',repr(v)
r = self.__solveAnagramPhrase(
self.__getSignature(phrase_string),
all_keys,
display_progress=display_progress )
r = [ [ self.sigs[s] for s in row ] for row in r ]
addTiming( 'Solve anagrams for "%s": %d rows' % (phrase_string, len(r),) )
return r
if __name__ == "__main__":
import optparse
description = '''\
Anagram generator by Mark Santesson. This program rearranges letters
in any non-option command line parameters and output the anagrams
that can be generated from them. The potential words are filtered
through a dictionary. To generate anagrams for more than one phrase,
separate the phrases with (escaped) semicolons.
'''
parser = optparse.OptionParser(
usage='usage: %prog [options] args',
description=description )
parser.add_option('-t', '--time', action='store_true', default=False,
dest='timing', help='Display timing information but not anagrams generated.')
parser.add_option('-d', '--dictionary', default='words',
dest='dictionary_name',
help='Specify a location for the dictionary. Default is "%default".')
parser.add_option('--dumpdict', action='store_true', default=False,
dest='dumpdict',
help='Dump dictionary after load, for debug purposes.')
options, test = parser.parse_args()
if len(test) >= 1:
test = ' '.join(test).split(';')
else:
test = [ "face", "astronomy", "SETEC Astronomy" ]
ag = AnagramGenerator( options.dictionary_name )
for x in test:
print "%s:" % x
ag.anagrams( x, not options.timing )
if options.timing:
print "Timings:"
for et,desc in timings:
print "%f - %s" % (et, desc,)
print "Done.\n"
# 932 seconds to solve SETEC Astronomy without limiting dictionary.
# The C version took 161 secs.
# 21 seconds after reducing the keys from the dict.
# With dictionary reduction at every recursion, it goes down to 4.25 seconds.