-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
64 lines (53 loc) · 1.81 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import time
time0 = time.time()
import sys
sys.setrecursionlimit(100000) # set the maximum depth as 1500
# 以上为必要的前期处理 以下为逻辑部分
import MeCab
def filter(text):
# 将原文中“[]《》”之间的文字(青空文库格式txt文件中的)删除
# 传入一个字符串,输出过滤后的字符串
flag = True
news = ''
for char in text:
if char in '[《':
flag = False
if flag:
news += char
if char in ']》':
flag = True
return news
def sort(words):
# 为单词计数列表排序,使用归并排序算法,传出有小到大排序后的列表
if len(words) <= 1:
return words
else:
small = []
big = []
middle = words.pop()
for word in words:
if word[0] > middle[0]:
big.append(word)
else:
small.append(word)
return sort(small) + [middle] + sort(big)
with open('wagahaiwa_nekodearu.txt', encoding = 'shift-JIS') as textfile:
text = textfile.read()
text = filter(text)
# 读取作品「我輩は猫である」并处理
m = MeCab.Tagger("") # MeCab分词器初始化
words = m.parse(text).split('\n') # 分词处理
dictcounter = {}
for word in words: # 词语计数
if word in dictcounter:
dictcounter[word] += 1
else:
dictcounter[word] = 1
words = []
for word in dictcounter: # 将字典中的计数信息转到列表中以方便排序
count = dictcounter[word]
words.append((count, word))
words = sort(words) # 对列表排序
for word in words[:-100:-1]: # 输出计数最高的100个词
print(word[0], word[1])
print(time.time() - time0) # 显示程序运行用时