# This script loads the CS276 dataset and counts its tokens (questions 1 to 5).
import os
import pickle
from math import log

import matplotlib.pyplot as plt

from index import *

# ====================================== FIRST PART: TOKENIZATION ======================================
tokenize = False  # Set to True to redo the tokenization and rebuild the dictionary; otherwise the saved dictionary is loaded


def load_saved_dictionnary(file_name):
    """Load a saved dictionary with pickle."""
    with open(file_name, 'rb') as handle:
        dictionary = pickle.load(handle)
    return dictionary


def save_dictionnary(file_name, dict_object):
    """Save a dictionary to disk with pickle."""
    with open(file_name, 'wb') as handle:
        pickle.dump(dict_object, handle, protocol=pickle.HIGHEST_PROTOCOL)


def read_common_words(filePath):
    """Return the list of common words (stop words) listed in the specified file."""
    with open(filePath, "r") as file:
        return file.read().splitlines()
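

# NOTE: Index.normalize is imported from index.py in this repository. It is assumed here
# to split a line of text into normalized tokens; the exact normalization (lowercasing,
# stop-word removal using read_common_words above, etc.) is defined in index.py.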
def fillDict(directory, tokens):
    """Fill the dictionary 'tokens' with the tokens of every file in the specified directory,
    mapping each token to its number of occurrences."""
    listFiles = os.listdir(directory)
    for fileName in listFiles:
        with open(os.path.join(directory, fileName), "r") as file:
            content = file.readlines()
        for line in content:
            words = Index.normalize(line)
            for word in words:
                tokens[word] = tokens.get(word, 0) + 1


def analysis(dir_path, redo_tokenization=False):
    """Answer questions 1 to 5 for the collection stored in dir_path (sub-directories 0 to 9).

    If redo_tokenization is True, the collection is read and tokenized again and the resulting
    dictionaries are saved with pickle; otherwise the previously saved dictionary is loaded."""
    tokens = dict()
    if redo_tokenization:
        print("Reading the dataset")
        for i in range(5):
            directory = os.path.join(dir_path, str(i))
            print("Reading files from directory " + str(i) + " ...")
            fillDict(directory, tokens)
        save_dictionnary("dictionnaireHalf.pkl", tokens)
        for i in range(5, 10):
            directory = os.path.join(dir_path, str(i))
            print("Reading files from directory " + str(i) + " ...")
            fillDict(directory, tokens)
        save_dictionnary("dictionnaire.pkl", tokens)
    else:
        print("Loading the saved results instead of reading the dataset again")
        tokens = load_saved_dictionnary("dictionnaire.pkl")

    # Question 1
    print("Question 1 : " + str(sum(tokens.values())) + " tokens in the collection")
    # Question 2
    print("Question 2 : " + str(len(tokens)) + " distinct tokens (vocabulary size)")
    # Question 3
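    # Heaps' law models the vocabulary size as V = k * T^b, where T is the total number of tokens.
    # With two measurement points -- (Tp, Vp) from the first half of the collection
    # (dictionnaireHalf.pkl, saved above) and (T, V) from the full collection -- the two
    # unknowns are obtained by solving the system in log space:
    #   b = (log V - log Vp) / (log T - log Tp)    and    k = V / T**b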
    T = sum(tokens.values())
    V = len(tokens)
    tokens_half = load_saved_dictionnary("dictionnaireHalf.pkl")
    Tp = sum(tokens_half.values())
    Vp = len(tokens_half)
    b = (log(V) - log(Vp)) / (log(T) - log(Tp))
    k = V / (T ** b)
    print("Question 3 : Heaps Law : k = {} and b = {}".format(k, b))

    # Question 4
    print("Question 4 : For 1 million tokens, the vocabulary size would be {}".format(int(k * 1e6 ** b)))

    # Question 5
    print("Question 5")
    frequencies = sorted(tokens.values(), reverse=True)
    ranks = [i + 1 for i in range(len(frequencies))]
    fig = plt.figure()
    ax1 = fig.add_subplot(2, 1, 1)
    ax1.plot(ranks, frequencies, color='blue', lw=2)
    ax1.set_xlabel("Rank")
    ax1.set_ylabel("Frequency")
    ax1.set_title("Frequency vs rank")
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.plot(ranks, frequencies, color='blue', lw=2)
    ax2.set_xscale('log')
    ax2.set_yscale('log')
    ax2.set_xlabel("Rank")
    ax2.set_ylabel("Frequency")
    ax2.set_title("Frequency vs rank (log scale)")
    plt.show()
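

# Example entry point: "pa1-data" is a placeholder (an assumption, not shown in the script)
# for the directory containing the CS276 collection with its sub-directories 0 to 9;
# adjust it to the actual dataset location before running.
if __name__ == "__main__":
    analysis("pa1-data", redo_tokenization=tokenize)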