-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathewordcount.py
executable file
·54 lines (41 loc) · 1.56 KB
/
ewordcount.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python
# Python program to solve the Knuth-McIlroy problem
# -- enhanced version to handle contractions
# See here: http://www.leancrew.com/all-this/2011/12/more-shell-less-egg/
#
# Problem spec:
# Read a file of text, determine the n most frequently used words, and
# print out a sorted list of those words along with their frequencies.
#
# The enhanced version of McIlroy's shell pipeline solution is in ewordcount.sh
# for comparison.
from collections import defaultdict
from re import compile, sub
# Hacks to make sure every byte except lowercase ASCII and apostrophe gets
# translated to a space. pipeline() applies the three patterns in order,
# replacing each match with a single space.
#
# rexp1: an apostrophe preceded by a non-letter; the preceding character is
# part of the match, so both characters are replaced together.
rexp1 = compile(r"([^A-Za-z])\'")
# rexp2: an apostrophe followed by a non-letter; again both characters are
# part of the match.
# NOTE: rexp1 and rexp2 each need a neighbouring character, so an apostrophe
# at the very start or very end of the input is not matched by either.
rexp2 = compile(r"\'([^A-Za-z])")
# rexp3: any single character that is not a lowercase ASCII letter or an
# apostrophe. pipeline() lowercases the text before applying this one, so
# uppercase letters are folded rather than blanked out.
rexp3 = compile(r"[^a-z\']")
def uniq(words):
    """Count how many times each word occurs in ``words``.

    Args:
        words: Any iterable of hashable items (typically word strings).

    Returns:
        A ``defaultdict`` mapping each distinct word to its number of
        occurrences. Looking up a word that was never seen yields 0.
    """
    # ``int`` is the conventional zero factory; unlike a lambda it also keeps
    # the returned mapping picklable.
    counts = defaultdict(int)
    for word in words:
        counts[word] += 1
    return counts
def pipeline(s, n):
    """Return the n most used words in s, handling contractions.

    The text is lowercased and every character other than a-z and the
    apostrophe is turned into a space; apostrophes adjacent to a non-letter
    are blanked out too, so only apostrophes between letters (contractions)
    stay inside a word.

    Note that we sort by both word count and word to match the behavior of
    the shell pipeline (multiple words with the same count are output in
    reverse alphabetical order).

    Args:
        s: The text to analyse, as a single string.
        n: Maximum number of entries to return.

    Returns:
        A list of ``(word, count)`` tuples, highest count first, truncated
        with the slice ``[:n]``.
    """
    # Blank out apostrophes that touch a non-letter, on either side.
    text = sub(rexp2, " ", sub(rexp1, " ", s))
    # Lowercase first: rexp3 only lets a-z and the apostrophe through.
    text = sub(rexp3, " ", text.lower())
    counts = uniq(text.split())
    # items() (not the Python-2-only iteritems()) yields the same pairs on
    # both Python 2 and Python 3.
    ranked = sorted(counts.items(), key=lambda x: (x[1], x[0]), reverse=True)
    return ranked[:n]
if __name__ == '__main__':
    # Script entry point: read text from stdin and print the N most frequent
    # words, where N is the first command-line argument.
    import sys

    # Fail with a usage message instead of an IndexError traceback when the
    # count argument is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: %s N < textfile" % sys.argv[0])

    for w, f in pipeline(sys.stdin.read(), int(sys.argv[1])):
        # Match the shell pipeline output format: count right-justified in
        # 7 columns, a space, then the word. A single parenthesised argument
        # prints identically under Python 2 and Python 3.
        print("%s %s" % (str(f).rjust(7), w))