# PositionalIndex.py
# Builds a positional index (term -> positions per document) over newsgroup files.
# importing libraries
import numpy as np
import os
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer
from natsort import natsorted
import string
def read_file(filename):
    """Read a text file and return its contents without header and footer.

    The file is decoded as ASCII with ``errors="surrogateescape"``, so bytes
    outside the ASCII range are carried through as lone surrogate code points
    instead of raising ``UnicodeDecodeError``.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text after ``remove_header_footer`` has dropped the first
        and last blank-line-separated blocks.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The ``with`` statement closes the file on exit, so no explicit close().
    with open(filename, 'r', encoding="ascii", errors="surrogateescape") as f:
        stuff = f.read()
    # Remove header and footer.
    return remove_header_footer(stuff)
def remove_header_footer(final_string):
    """Drop the first and last blank-line-separated blocks of a text.

    The text is split on ``'\\n\\n'``; the first block (treated as the header)
    and the last block (treated as the footer) are discarded, and the
    remaining blocks are concatenated with a single space appended to each.

    Args:
        final_string: The raw text.

    Returns:
        The inner blocks, each followed by one space. Returns an empty string
        when the text has fewer than three blocks.
    """
    blocks = final_string.split('\n\n')
    # blocks[1:-1] skips the header (first) and footer (last); join avoids
    # rebuilding the string on every iteration.
    return "".join(block + " " for block in blocks[1:-1])
def preprocessing(final_string):
    """Tokenize text and return cleaned, lowercase tokens in text order.

    Steps:
      1. Tokenize with NLTK's ``TweetTokenizer``.
      2. Delete tab characters and every ``string.punctuation`` character
         except the apostrophe from each token.
      3. Discard tokens that became empty.
      4. Lowercase the remaining tokens.

    Args:
        final_string: The text to tokenize.

    Returns:
        A list of cleaned tokens, in the order they appear in the text.
    """
    # Tokenize.
    tokens = TweetTokenizer().tokenize(final_string)

    # A single translation table deletes tabs and all punctuation except the
    # apostrophe, so each token is scanned once.
    chars_to_delete = '\t' + string.punctuation.replace("'", "")
    strip_table = str.maketrans('', '', chars_to_delete)
    stripped_tokens = (token.translate(strip_table) for token in tokens)

    # Drop tokens that were pure punctuation, then change to lowercase.
    return [token.lower() for token in stripped_tokens if token]
# In this example, we create the positional index for only 1 folder.
folder_names = ["comp.graphics"]

# Initialize the stemmer.
stemmer = PorterStemmer()

# Document ID counter; IDs are assigned in the order files are processed.
fileno = 0

# Positional index: stemmed term -> [total frequency, {fileno: [positions]}].
pos_index = {}

# File mapping (fileno -> file name).
file_map = {}

for folder_name in folder_names:
    folder_path = "20_newsgroups/" + folder_name

    # natsorted orders the names naturally (e.g. "2" before "10").
    file_names = natsorted(os.listdir(folder_path))

    # For every file.
    for file_name in file_names:
        file_path = folder_path + "/" + file_name

        # Read file contents (header and footer already removed).
        stuff = read_file(file_path)

        # This is the list of words in order of the text.
        # We need to preserve the order because we require positions.
        # 'preprocessing' tokenizes, strips punctuation and lowercases;
        # it does not remove stopwords.
        final_token_list = preprocessing(stuff)

        # A position is the token's index in the preprocessed token list.
        for pos, token in enumerate(final_token_list):
            # Index the stemmed form of the token.
            term = stemmer.stem(token)

            # First encounter of the term: zero frequency, empty postings.
            entry = pos_index.get(term)
            if entry is None:
                entry = pos_index[term] = [0, {}]

            # Increment total frequency and record the position under this
            # document, creating the document's position list if needed.
            entry[0] += 1
            entry[1].setdefault(fileno, []).append(pos)

        # Map the file no. to the file name.
        file_map[fileno] = file_path

        # Increment the file no. counter for document ID mapping.
        fileno += 1
# Sample positional index to test the code.
# Index keys are stemmed terms, so the lookup term is stemmed the same way.
sample_term = stemmer.stem("andrew")

# .get avoids a KeyError when the term does not occur in the indexed files.
sample_pos_idx = pos_index.get(sample_term)

if sample_pos_idx is None:
    print("Term " + repr(sample_term) + " not found in the positional index.")
else:
    print("Positional Index")
    print(sample_pos_idx)

    # sample_pos_idx is [total frequency, {fileno: [positions]}].
    file_list = sample_pos_idx[1]

    print("Filename, [Positions]")
    # Use a separate loop name so the module-level 'fileno' counter is not
    # overwritten.
    for doc_id, positions in file_list.items():
        print(file_map[doc_id], positions)