-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathApriori.py
164 lines (137 loc) · 4.88 KB
/
Apriori.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
import os
import time
import openpyxl
from logger import Logger
def load_dataset(data_path):
    """Load the transactions stored in worksheet 'Sheet3' of an Excel workbook.

    Every worksheet row becomes one transaction (a list of cell values).
    Cells whose value is falsy (e.g. empty cells) are left out, so rows may
    have different lengths and an entirely empty row yields an empty list.

    Args:
        data_path: path of the workbook, passed to ``openpyxl.load_workbook``.

    Returns:
        A list of transactions, one list of cell values per worksheet row.
    """
    sheet = openpyxl.load_workbook(data_path)['Sheet3']
    # openpyxl rows/columns are 1-based, hence the "+ 1" exclusive bounds.
    row_stop = sheet.max_row + 1
    col_stop = sheet.max_column + 1
    return [
        [
            value
            for value in (
                sheet.cell(row, col).value for col in range(1, col_stop)
            )
            if value
        ]
        for row in range(1, row_stop)
    ]
def create_C1(dataset):
    """Build the initial candidate set C1 of single-item itemsets.

    Args:
        dataset: iterable of transactions, each an iterable of hashable items.

    Returns:
        A set with one ``frozenset`` per distinct item found in ``dataset``.
    """
    # frozenset (not set) so the itemsets can themselves be set members.
    return {
        frozenset([item])
        for transaction in dataset
        for item in transaction
    }
def is_apriori(Ck_item, Lksub1):
    """Check whether a candidate itemset satisfies the Apriori property.

    Args:
        Ck_item: candidate itemset (a ``frozenset``).
        Lksub1: collection of frequent itemsets that are one item smaller.

    Returns:
        True when every subset obtained by removing exactly one item from
        ``Ck_item`` is contained in ``Lksub1``; False as soon as one is not.
        An empty candidate trivially returns True.
    """
    return all(
        Ck_item - frozenset([element]) in Lksub1
        for element in Ck_item
    )
def create_Ck(Lksub1, k):
    """Create the k-item candidate set Ck from the frequent set L(k-1).

    Two (k-1)-itemsets are joined when their first ``k - 2`` items (in sorted
    order) are equal; the union is kept only if it passes the Apriori check
    (``is_apriori``), i.e. all of its (k-1)-subsets are in ``Lksub1``.

    Args:
        Lksub1: set of frequent (k-1)-itemsets (``frozenset`` objects).
        k: size of the candidate itemsets to generate.

    Returns:
        A set of ``frozenset`` candidates of size ``k``.
    """
    Ck = set()
    if len(Lksub1) < 2:
        # Nothing to join: at least two itemsets are needed for a candidate.
        return Ck
    # Sort every itemset once up front instead of re-sorting both operands
    # inside the quadratic pair loop.
    prefixed = [(sorted(itemset)[:k - 2], itemset) for itemset in Lksub1]
    for i, (prefix_a, set_a) in enumerate(prefixed):
        # The join is symmetric, so each unordered pair is visited only once
        # and an itemset is never paired with itself.
        for prefix_b, set_b in prefixed[i + 1:]:
            if prefix_a != prefix_b:
                continue
            candidate = set_a | set_b
            if is_apriori(candidate, Lksub1):
                Ck.add(candidate)
    return Ck
def generate_Lk_by_Ck(dataset, Ck, min_support, support_data):
    """Filter the candidate set Ck down to the frequent itemsets Lk.

    Args:
        dataset: sequence of transactions, each an iterable of hashable items.
        Ck: collection of candidate itemsets (``frozenset`` objects).
        min_support: minimum fraction of transactions that must contain an
            itemset for it to count as frequent.
        support_data: dict updated in place; for every frequent itemset found
            here its support (a float) is stored under the itemset as key.

    Returns:
        The set of candidates whose support is >= ``min_support``.
    """
    Lk = set()
    if not Ck:
        # Without candidates nothing can be frequent; skip the dataset scan.
        return Lk
    item_count = {}
    for t in dataset:
        # Convert the transaction to a set once; issubset() on a list would
        # rebuild that set for every single candidate.
        transaction = frozenset(t)
        for item in Ck:
            if item.issubset(transaction):
                # count when item appears in t
                item_count[item] = item_count.get(item, 0) + 1
    t_num = float(len(dataset))
    for item, count in item_count.items():
        support = count / t_num
        # keep items whose support >= min_support
        if support >= min_support:
            Lk.add(item)
            support_data[item] = support
    return Lk
def generate_L(dataset, k, min_support):
    """Generate the frequent itemsets of sizes 1..k and their supports.

    Args:
        dataset: sequence of transactions.
        k: largest itemset size to mine.
        min_support: minimum support an itemset needs to be frequent.

    Returns:
        A tuple ``(L, support_data)`` where ``L[i]`` is the set of frequent
        itemsets of size ``i + 1`` (possibly empty) and ``support_data`` maps
        every frequent itemset to its support.
    """
    support_data = {}  # support of every frequent itemset found so far

    # Level 1: frequent single items.
    previous_level = generate_Lk_by_Ck(
        dataset, create_C1(dataset), min_support, support_data
    )
    L = [previous_level]

    # Levels 2..k: join the previous level into candidates, then filter them.
    for size in range(2, k + 1):
        candidates = create_Ck(previous_level, size)
        previous_level = generate_Lk_by_Ck(
            dataset, candidates, min_support, support_data
        )
        L.append(previous_level)
    return L, support_data
def generate_big_rules(L, support_data, min_conf):
    """Generate strong association rules from the frequent itemsets.

    Every frequent itemset is compared with all frequent itemsets visited
    before it. When such an earlier itemset ``sub_set`` is a subset of the
    current ``freq_set``, the rule ``(freq_set - sub_set) -> sub_set`` is
    scored with ``support(freq_set) / support(freq_set - sub_set)``.

    Args:
        L: list of sets of frequent itemsets, ordered by itemset size.
        support_data: dict mapping each frequent itemset to its support.
        min_conf: minimum confidence a rule needs to be reported.

    Returns:
        A list of unique tuples ``(antecedent, consequent, confidence,
        support)`` with ``confidence >= min_conf``, in discovery order.

    Raises:
        KeyError: if a required itemset is missing from ``support_data``.
    """
    big_rules = []
    # Mirrors big_rules for O(1) duplicate checks instead of scanning the list.
    emitted = set()
    sub_set_list = []
    for level in L:
        for freq_set in level:
            for sub_set in sub_set_list:
                if not sub_set.issubset(freq_set):
                    continue
                antecedent = freq_set - sub_set
                conf = support_data[freq_set] / support_data[antecedent]
                big_rule = (antecedent, sub_set, conf, support_data[freq_set])
                if conf >= min_conf and big_rule not in emitted:
                    emitted.add(big_rule)
                    big_rules.append(big_rule)
            # Only itemsets seen earlier can act as the consequent of a rule.
            sub_set_list.append(freq_set)
    return big_rules
class config:
    """Experiment grid: every support value is combined with every confidence value."""

    # minimum-support thresholds to try
    support_list = [
        0.05,
        0.1,
        0.2,
        0.3,
    ]
    # minimum-confidence thresholds to try
    conf_list = [
        0.8,
        0.85,
        0.9,
        0.95,
    ]
if __name__ == '__main__':
    # Load the dataset once, then mine strong association rules for every
    # (min_support, min_conf) combination in config, one log file per run.
    data_path = r"./Database/dataset.xlsx"
    dataset = load_dataset(data_path)
    log_dir = r"./log"
    # exist_ok=True replaces the race-prone "check, then mkdir" sequence and
    # also creates missing parent directories.
    os.makedirs(log_dir, exist_ok=True)
    total = 1  # record how many experiments have been completed
    for min_support in config.support_list:
        for min_conf in config.conf_list:
            # create specific logger
            title = "shopping" + "_" + str(min_support) + "_" + str(min_conf) + ".txt"
            logger = Logger(os.path.join(log_dir, title))
            # association rules mining (timed as a whole, including the
            # frequent-itemset search)
            start_time = time.time()
            L, support_data = generate_L(dataset, k=3, min_support=min_support)
            big_rule_list = generate_big_rules(L, support_data, min_conf=min_conf)
            end_time = time.time()
            duration = end_time - start_time
            # save the mining result
            logger.write(f"Experiment #{total} Min_Sup : {min_support}, Min_Conf : {min_conf}\n")
            logger.write("----------------------------------------------\n")
            logger.write("Hints:\n")
            logger.write(" 1.The following data is organized in (I, min_sup) or (I1, I2, min_conf, min_sup)\n")
            logger.write(" 2.All decimals are rounded to 6 decimal places.\n")
            logger.write("-----------------------Frequent Item Sets-----------------------\n")
            for item, support in support_data.items():
                logger.write(f"{list(item)} {support:.6f}\n")
            logger.write("-----------------------Strong Association rules-----------------------\n")
            for rule_no, big_rule in enumerate(big_rule_list, start=1):
                I1 = list(big_rule[0])
                I2 = list(big_rule[1])
                logger.write(f"{rule_no} {I1} {I2} {big_rule[2]:.6f} {big_rule[3]:.6f}\n")
            logger.write("\n\nAssociation rules mining has completed.\n\n")
            logger.write(f"The time spending on Apriori (seconds): {duration:.6f}\n")
            total += 1