-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMain.py
109 lines (79 loc) · 3.22 KB
/
Main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import math
from Corpus import Corpus
from ClusteredCorpus import ClusteredCorpus
from Document import Document
from prettytable import PrettyTable
import os
import operator
def main():
    """Score the candidates of ``target.txt`` and write them out as a table.

    Loads the clustered corpus from the ``clustered_corpus`` directory,
    flattens it into a single corpus, and then, for every candidate of the
    target document, computes:

    * ``tf``  - ``log10(1 + term frequency in the target document)``
    * ``idf`` - ``log2(1 + 1 / document frequency in the flat corpus)``
    * ``cu``  - the value returned by ``ClusteredCorpus.get_cu_for``

    The rank of a candidate is its ``cu`` value. Results are keyed by the
    representative the flat corpus returns for the candidate, rendered by
    ``generate_table_based_on`` and stored by ``save_as_file``.
    """
    clusters = read_clustered_corpus('clustered_corpus')
    all_documents = merge_clustered_corpus_into_a_single_corpus(clusters)

    target_document = Document(read_text_file('target.txt'))
    flat_corpus = Corpus(all_documents)
    grouped_corpus = ClusteredCorpus(clusters)

    rank_by_candidate = {}
    params_by_candidate = {}
    cluster_dfs_by_candidate = {}

    for candidate in target_document.get_candidates():
        tf = math.log(1.0 + target_document.get_tf_for(candidate), 10.0)
        idf = math.log(1.0 + 1.0 / flat_corpus.get_df_for(candidate), 2.0)
        cu = grouped_corpus.get_cu_for(candidate)
        cluster_dfs = grouped_corpus.get_dfs_in_each_cluster_for(candidate)
        representative = flat_corpus.get_representative_for(candidate)

        # CU alone is the ranking score; TF and IDF are only reported.
        # (Alternatives that were tried before: tf * cu and tf * idf.)
        rank_by_candidate[representative] = cu
        params_by_candidate[representative] = (tf, idf, cu)
        cluster_dfs_by_candidate[representative] = cluster_dfs

    save_as_file(
        generate_table_based_on(
            rank_by_candidate,
            params_by_candidate,
            cluster_dfs_by_candidate,
        )
    )
    print('Done.')
def read_clustered_corpus(path):
    """Load every cluster directory below *path* as a list of documents.

    Each sub-directory of *path* is treated as one cluster, and each entry
    inside it is read with ``read_text_file`` and wrapped in a ``Document``.

    Entries are visited in sorted name order. ``os.listdir`` returns names
    in arbitrary order, so without sorting the order of the clusters (and
    anything indexed by cluster position downstream) could differ between
    runs and platforms. Entries directly under *path* that are not
    directories are skipped instead of crashing the nested listing.

    Args:
        path: Directory that contains one sub-directory per cluster.

    Returns:
        A list of clusters, each cluster being a list of ``Document``
        objects.
    """
    clusters = []
    for cluster_name in sorted(os.listdir(path)):
        cluster_path = os.path.join(path, cluster_name)
        if not os.path.isdir(cluster_path):
            # A stray file next to the cluster directories is not a cluster.
            continue
        clusters.append([
            Document(read_text_file(os.path.join(cluster_path, file_name)))
            for file_name in sorted(os.listdir(cluster_path))
        ])
    return clusters
def merge_clustered_corpus_into_a_single_corpus(clustered_corpus):
    """Flatten a sequence of clusters into one new list of their items.

    The items keep their order: all items of the first cluster, then all
    items of the second, and so on. The input is not modified.
    """
    return [item for cluster in clustered_corpus for item in cluster]
def read_text_file(path):
    """Return the whole content of the UTF-8 text file at *path*.

    The file is opened in a ``with`` block so the handle is closed as soon
    as the content has been read, rather than whenever the garbage
    collector gets to it.

    Args:
        path: Location of the file to read.

    Returns:
        The decoded file content as a single string.
    """
    with open(path, 'r', encoding='utf-8') as text_file:
        return text_file.read()
def generate_table_based_on(
    candidate_to_rank_mapping,
    candidate_to_params_mapping,
    candidate_to_dfs_in_each_cluster_mapping
):
    """Build a ``PrettyTable`` of candidates ordered by descending rank.

    Every row holds the candidate followed by the string forms of its rank,
    its ``(tf, idf, cu)`` parameters and the first two entries of its
    per-cluster document frequencies (columns ``DF_1`` and ``DF_2``).

    Args:
        candidate_to_rank_mapping: Maps candidate -> rank (the sort key).
        candidate_to_params_mapping: Maps candidate -> indexable whose
            items 0, 1 and 2 are tf, idf and cu.
        candidate_to_dfs_in_each_cluster_mapping: Maps candidate ->
            indexable whose items 0 and 1 are reported.

    Returns:
        The populated ``PrettyTable``.
    """
    table = PrettyTable(['Candidate', 'Rank', 'TF', 'IDF', 'CU', 'DF_1', 'DF_2'])
    pick_params = operator.itemgetter(0, 1, 2)
    pick_first_two_dfs = operator.itemgetter(0, 1)

    ranked = sorted(
        candidate_to_rank_mapping.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    for candidate, rank in ranked:
        tf, idf, cu = pick_params(candidate_to_params_mapping[candidate])
        df_1, df_2 = pick_first_two_dfs(
            candidate_to_dfs_in_each_cluster_mapping[candidate]
        )
        table.add_row([candidate, *map(str, (rank, tf, idf, cu, df_1, df_2))])
    return table
def save_as_file(table):
    """Left-align *table* and write its string form to ``result.txt``.

    The file is created or overwritten in the current working directory and
    encoded as UTF-8. Note that the ``align`` attribute of *table* is
    changed as a side effect.
    """
    table.align = 'l'
    with open('result.txt', 'w', encoding='utf-8') as output:
        rendered = table.get_string()
        output.write(rendered)
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if '__main__' == __name__:
    main()