
Commit 61738ef

fix errors in evaluation

jhljx committed Sep 7, 2020
1 parent 48a8ff6 commit 61738ef
Showing 5 changed files with 308 additions and 65 deletions.
56 changes: 24 additions & 32 deletions evaluation/centrality_prediction.py
@@ -11,7 +11,7 @@
from utils import check_and_make_path


-# Generate data used for equivalence prediction
+# Generate data used for centrality prediction
class DataGenerator(object):
base_path: str
input_base_path: str
@@ -103,23 +103,23 @@ def generate_all_node_samples(self, sep='\t', worker=-1):
pool.join()


-# Equivalence predictor class
-class EquivalencePredictor(object):
+# Centrality predictor class
+class CentralityPredictor(object):
base_path: str
origin_base_path: str
embedding_base_path: str
-equ_base_path: str
+centrality_base_path: str
output_base_path: str
file_sep: str
full_node_list: list
alpha_list: list
split_fold: int

-def __init__(self, base_path, origin_folder, embedding_folder, equ_folder, output_folder, node_file, file_sep='\t', alpha_list=None, split_fold=5):
+def __init__(self, base_path, origin_folder, embedding_folder, centrality_folder, output_folder, node_file, file_sep='\t', alpha_list=None, split_fold=5):
self.base_path = base_path
self.origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
self.embedding_base_path = os.path.abspath(os.path.join(base_path, embedding_folder))
-self.equ_base_path = os.path.abspath(os.path.join(base_path, equ_folder))
+self.centrality_base_path = os.path.abspath(os.path.join(base_path, centrality_folder))
self.output_base_path = os.path.abspath(os.path.join(base_path, output_folder))
self.file_sep = file_sep

@@ -137,16 +137,7 @@ def get_prediction_error(self, centrality_data, embeddings, date):
centrality_list = ['closeness', 'betweenness', 'eigenvector', 'kcore']
mse_list = [date]
for i, centrality in enumerate(centrality_list):
-# model = LinearRegression(n_jobs=-1)
-# min_error = float("inf")
-# # only using a certain cv value may cause some method have very large MSE, so we need to try different cv values
-# # for cv in [2, 3, 4, 5, 6, 7, 8, 9, 10, 20]:
-# y_pred = cross_val_predict(model, embeddings, centrality_data[:, i], cv=5)
-# error = mean_squared_error(centrality_data[:, i], y_pred) / np.mean(centrality_data[:, i])
-# min_error = min(min_error, error)
-
min_error = float("inf")
-# for alpha in [0.05, 0.5, 1, 2, 5, 10]:
for alpha in self.alpha_list:
model = Ridge(alpha=alpha)
y_pred = cross_val_predict(model, embeddings, centrality_data[:, i], cv=self.split_fold)
@@ -155,18 +146,19 @@ def equivalence_prediction_all_time(self, method):
mse_list.append(min_error)
return mse_list
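For context, a minimal self-contained sketch of what get_prediction_error does per centrality measure, with synthetic data standing in for real embeddings and scores (the shapes and alpha grid below are illustrative, not the repository's defaults): ridge regression is cross-validated and the best mean-normalized MSE over the alpha grid is kept.

```python
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(200, 16))   # one row per node (synthetic)
centrality = rng.random(200)              # e.g. closeness scores (synthetic)

min_error = float('inf')
for alpha in [0.05, 0.5, 1, 2, 5, 10]:    # illustrative alpha_list
    model = Ridge(alpha=alpha)
    y_pred = cross_val_predict(model, embeddings, centrality, cv=5)
    # MSE normalized by the mean centrality, matching the deleted comment block
    error = mean_squared_error(centrality, y_pred) / np.mean(centrality)
    min_error = min(min_error, error)
print('best normalized MSE:', min_error)
```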

-def equivalence_prediction_all_time(self, method):
+def centrality_prediction_all_time(self, method):
print('method = ', method)
f_list = sorted(os.listdir(self.origin_base_path))
all_mse_list = []
for i, f_name in enumerate(f_list):
print('Current date is: {}'.format(f_name))
date = f_name.split('.')[0]
-df_centrality = pd.read_csv(os.path.join(self.equ_base_path, date + '_centrality.csv'), sep=self.file_sep)
+df_centrality = pd.read_csv(os.path.join(self.centrality_base_path, date + '_centrality.csv'), sep=self.file_sep)
centrality_data = df_centrality.iloc[:, 1:].values
-if not os.path.exists(os.path.join(self.embedding_base_path, method, f_name)):
+cur_embedding_path = os.path.join(self.embedding_base_path, method, f_name)
+if not os.path.exists(cur_embedding_path):
continue
-df_embedding = pd.read_csv(os.path.join(self.embedding_base_path, method, f_name), sep=self.file_sep, index_col=0)
+df_embedding = pd.read_csv(cur_embedding_path, sep=self.file_sep, index_col=0)
df_embedding = df_embedding.loc[self.full_node_list]
embeddings = df_embedding.values
mse_list = self.get_prediction_error(centrality_data, embeddings, date)
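The recurring refactor in this commit — build the embedding path once, skip missing snapshots, then align rows to the canonical node order — in isolation (the path and node names below are placeholders):

```python
import os
import pandas as pd

full_node_list = ['n0', 'n1', 'n2']  # canonical node order (placeholder)
cur_embedding_path = os.path.join('embedding', 'some_method', '2020_01.csv')

if os.path.exists(cur_embedding_path):
    df_embedding = pd.read_csv(cur_embedding_path, sep='\t', index_col=0)
    # .loc reindexes rows so embeddings[i] corresponds to full_node_list[i]
    embeddings = df_embedding.loc[full_node_list].values
```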
@@ -181,49 +173,49 @@ def equivalence_prediction_all_time(self, method):
output_file_path = os.path.join(self.output_base_path, method + '_mse_record.csv')
df_output.to_csv(output_file_path, sep=',', index=False)

-def equivalence_prediction_all_method(self, method_list=None, worker=-1):
-print('Start equivalence_prediction!')
+def centrality_prediction_all_method(self, method_list=None, worker=-1):
+print('Start graph centrality prediction!')
if method_list is None:
method_list = os.listdir(self.embedding_base_path)

if worker <= 0:
for method in method_list:
print('Current method is :{}'.format(method))
-self.equivalence_prediction_all_time(method)
+self.centrality_prediction_all_time(method)
else:
worker = min(worker, os.cpu_count())
pool = multiprocessing.Pool(processes=worker)
print("start " + str(worker) + " worker(s)")

for method in method_list:
-pool.apply_async(self.equivalence_prediction_all_time, (method,))
+pool.apply_async(self.centrality_prediction_all_time, (method,))
pool.close()
pool.join()
-print('Finish equivalence_prediction!')
+print('Finish graph centrality prediction!')
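A sketch of the worker-pool dispatch used above, with a module-level function standing in for the bound method (apply_async needs the callable and its arguments to be picklable, which the predictor class presumably satisfies):

```python
import multiprocessing
import os

def run_method(method):
    # stand-in for CentralityPredictor.centrality_prediction_all_time
    print('evaluating', method)

if __name__ == '__main__':
    method_list = ['deepwalk', 'node2vec']  # illustrative method names
    worker = min(2, os.cpu_count())
    pool = multiprocessing.Pool(processes=worker)
    for method in method_list:
        pool.apply_async(run_method, (method,))
    pool.close()
    pool.join()
```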


-def equivalence_prediction(args):
+def centrality_prediction(args):
base_path = args['base_path']
origin_folder = args['origin_folder']
embedding_folder = args['embed_folder']
node_file = args['node_file']
-equ_data_folder = args['equpred_data_folder']
-equ_res_folder = args['equpred_res_folder']
+centrality_data_folder = args['centrality_data_folder']
+centrality_res_folder = args['centrality_res_folder']
file_sep = args['file_sep']
generate = args['generate']
method_list = args['method_list']
alpha_list = args['alpha_list']
split_fold = args['split_fold'] # cross validation split fold
worker = args['worker']

-data_generator = DataGenerator(base_path=base_path, input_folder=origin_folder, output_folder=equ_data_folder, node_file=node_file, file_sep=file_sep)
+data_generator = DataGenerator(base_path=base_path, input_folder=origin_folder, output_folder=centrality_data_folder, node_file=node_file, file_sep=file_sep)
if generate:
data_generator.generate_all_node_samples(sep=file_sep, worker=worker)

-equivalence_predictor = EquivalencePredictor(base_path=base_path, origin_folder=origin_folder, embedding_folder=embedding_folder, equ_folder=equ_data_folder,
-output_folder=equ_res_folder, node_file=node_file, file_sep=file_sep, alpha_list=alpha_list, split_fold=split_fold)
+centrality_predictor = CentralityPredictor(base_path=base_path, origin_folder=origin_folder, embedding_folder=embedding_folder, centrality_folder=centrality_data_folder,
+output_folder=centrality_res_folder, node_file=node_file, file_sep=file_sep, alpha_list=alpha_list, split_fold=split_fold)

t1 = time.time()
-equivalence_predictor.equivalence_prediction_all_method(method_list=method_list, worker=worker)
+centrality_predictor.centrality_prediction_all_method(method_list=method_list, worker=worker)
t2 = time.time()
-print('equivalence prediction cost time: ', t2 - t1, ' seconds!')
+print('centrality prediction cost time: ', t2 - t1, ' seconds!')
23 changes: 12 additions & 11 deletions evaluation/edge_classification.py
@@ -21,7 +21,6 @@ class DataGenerator(object):
full_node_list: list
node2idx_dict: dict
node_num: int
-label_dict: dict
train_ratio: float
val_ratio: float
test_ratio: float
@@ -38,7 +37,6 @@ def __init__(self, base_path, input_folder, output_folder, node_file, label_fold
self.full_node_list = nodes_set['node'].tolist()
self.node2idx_dict = dict(zip(self.full_node_list, np.arange(self.node_num)))
self.node_num = len(self.full_node_list)
-self.label_dict = dict()

assert train_ratio + test_ratio + val_ratio <= 1.0
self.train_ratio = train_ratio
@@ -57,9 +55,6 @@ def generate_edge_samples(self, file_name, sep='\t'):
df_edges[['from_id', 'to_id']] = df_edges[['from_id', 'to_id']].applymap(lambda x: self.node2idx_dict[x])
edge_arr = df_edges[['from_id', 'to_id']].values
label_arr = df_edges['label'].values
-unique_labels = df_edges['label'].unique()
-for label in unique_labels:
-self.label_dict[label] = 1

edge_indices = np.arange(edge_num)
np.random.shuffle(edge_indices)
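The elided body of generate_edge_samples shuffles edge indices and cuts them by the configured ratios; a minimal sketch of that kind of split, under assumed sizes and ratios:

```python
import numpy as np

edge_num = 1000
train_ratio, val_ratio = 0.7, 0.1  # illustrative; actual ratios come from args

edge_indices = np.arange(edge_num)
np.random.shuffle(edge_indices)
train_end = int(edge_num * train_ratio)
val_end = train_end + int(edge_num * val_ratio)
train_idx = edge_indices[:train_end]
val_idx = edge_indices[train_end:val_end]
test_idx = edge_indices[val_end:]  # remainder becomes the test set
```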
@@ -119,7 +114,7 @@ class EdgeClassifier(object):
C_list: list
max_iter: int

-def __init__(self, base_path, origin_folder, embedding_folder, edgeclas_folder, output_folder, node_file, unique_labels, file_sep='\t', C_list=None, max_iter=5000):
+def __init__(self, base_path, origin_folder, embedding_folder, edgeclas_folder, output_folder, node_file, label_folder, file_sep='\t', C_list=None, max_iter=5000):
self.base_path = base_path
self.origin_base_path = os.path.abspath(os.path.join(base_path, origin_folder))
self.embedding_base_path = os.path.abspath(os.path.join(base_path, embedding_folder))
@@ -129,8 +124,13 @@ def __init__(self, base_path, origin_folder, embedding_folder, edgeclas_folder,

node_file_path = os.path.abspath(os.path.join(base_path, node_file))
nodes_set = pd.read_csv(node_file_path, names=['node'])
-self.full_node_list = nodes_set['edge'].tolist()
-self.unique_labels = unique_labels
+self.full_node_list = nodes_set['node'].tolist()
+self.label_base_path = os.path.abspath(os.path.join(base_path, label_folder))
+f_list = os.listdir(self.label_base_path)
+assert len(f_list) > 0
+label_path = os.path.join(self.label_base_path, f_list[0])
+df_label = pd.read_csv(label_path, sep=file_sep)
+self.unique_labels = df_label['label'].unique()
self.C_list = C_list
self.max_iter = max_iter
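The added lines infer the label set from the first label file alone, which silently assumes every snapshot shares one label vocabulary; a hypothetical sanity check (not part of the commit) could verify that assumption:

```python
import os
import pandas as pd

label_base_path = 'edge_label_folder'  # placeholder path
f_list = sorted(os.listdir(label_base_path))
label_sets = [set(pd.read_csv(os.path.join(label_base_path, f), sep='\t')['label'])
              for f in f_list]
assert all(s == label_sets[0] for s in label_sets), 'label sets differ across snapshots'
```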

@@ -191,9 +191,10 @@ def edge_classification_all_time(self, method):
train_edges = pd.read_csv(os.path.join(self.edgeclas_base_path, date + '_train.csv'), sep=self.file_sep).values
val_edges = pd.read_csv(os.path.join(self.edgeclas_base_path, date + '_val.csv'), sep=self.file_sep).values
test_edges = pd.read_csv(os.path.join(self.edgeclas_base_path, date + '_test.csv'), sep=self.file_sep).values
-if not os.path.exists(os.path.join(self.embedding_base_path, method, f_name)):
+cur_embedding_path = os.path.join(self.embedding_base_path, method, f_name)
+if not os.path.exists(cur_embedding_path):
continue
-df_embedding = pd.read_csv(os.path.join(self.embedding_base_path, method, f_name), sep=self.file_sep, index_col=0)
+df_embedding = pd.read_csv(cur_embedding_path, sep=self.file_sep, index_col=0)
df_embedding = df_embedding.loc[self.full_node_list]
embeddings = df_embedding.values

@@ -286,7 +287,7 @@ def edge_classification(args):
data_generator.generate_edge_samples_all_time(sep=file_sep, worker=worker)

edge_classifier = EdgeClassifier(base_path=base_path, origin_folder=origin_folder, embedding_folder=embedding_folder, edgeclas_folder=edgecls_data_folder + '_' + str(i),
-output_folder=edgecls_res_folder + '_' + str(i), node_file=node_file, file_sep=file_sep, unique_labels=list(data_generator.label_dict.keys()), C_list=C_list, max_iter=max_iter)
+output_folder=edgecls_res_folder + '_' + str(i), node_file=node_file, label_folder=elabel_folder, file_sep=file_sep, C_list=C_list, max_iter=max_iter)
edge_classifier.edge_classification_all_method(method_list=method_list, worker=worker)

t2 = time.time()
15 changes: 6 additions & 9 deletions evaluation/link_prediction.py
@@ -6,12 +6,9 @@
import multiprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
-from utils import check_and_make_path, get_neg_edge_samples
+from utils import check_and_make_path, get_neg_edge_samples, sigmoid


-def sigmoid(x):
-    return 1 / (1 + np.exp(-x))
-
# Generate data used for link prediction
class DataGenerator(object):
base_path: str
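sigmoid now comes from utils instead of being defined locally; in link prediction it typically squashes an edge's dot-product score into (0, 1). A small sketch assuming the same definition that was removed here, with made-up embeddings:

```python
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

emb_u = np.array([0.2, -0.1, 0.5])  # illustrative node embeddings
emb_v = np.array([0.4, 0.3, -0.2])
edge_score = sigmoid(np.dot(emb_u, emb_v))  # probability-like link score
print(edge_score)
```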
@@ -223,11 +220,12 @@ def link_prediction_all_time(self, method):
val_edges = pd.read_csv(os.path.join(self.lp_edge_base_path, date + '_val.csv'), sep=self.file_sep).values
test_edges = pd.read_csv(os.path.join(self.lp_edge_base_path, date + '_test.csv'), sep=self.file_sep).values
pre_f_name = f_list[i - 1]
-if not os.path.exists(os.path.join(self.embedding_base_path, method, pre_f_name)):
+pre_embedding_path = os.path.join(self.embedding_base_path, method, pre_f_name)
+if not os.path.exists(pre_embedding_path):
continue
# print('pre_f_name: ', f_list[i - 1], ', f_name: ', f_name)
print('Current date is: {}'.format(f_name))
-df_embedding = pd.read_csv(os.path.join(self.embedding_base_path, method, pre_f_name), sep=self.file_sep, index_col=0)
+df_embedding = pd.read_csv(pre_embedding_path, sep=self.file_sep, index_col=0)
df_embedding = df_embedding.loc[self.full_node_list, :]
# node_num = len(self.full_node_list)
# for j in range(node_num):
@@ -274,15 +272,14 @@ def aggregate_results(base_path, lp_res_folder, start_idx, rep_num, method_list,
for method in method_list:
res_base_path = os.path.join(base_path, lp_res_folder + '_' + str(start_idx))
res_path = os.path.join(res_base_path, method + '_auc_record.csv')

column_names = ['date'] + [measure + '_' + str(start_idx) for measure in measure_list]
df_method = pd.read_csv(res_path, sep=',', header=0, names=column_names)
measure_df_dict = dict()
for measure in measure_list:
df_measure = df_method.loc[:, ['date', measure + '_' + str(start_idx)]].copy()
measure_df_dict[measure] = df_measure
for i in range(start_idx + 1, start_idx + rep_num):
-res_base_path = os.path.join(base_path, 'link_prediction_res_' + str(i))
+res_base_path = os.path.join(base_path, lp_res_folder + '_' + str(i))
res_path = os.path.join(res_base_path, method + '_auc_record.csv')
column_names = ['date'] + [measure + '_' + str(i) for measure in measure_list]
df_rep = pd.read_csv(res_path, sep=',', header=0, names=column_names)
@@ -295,7 +292,7 @@
measure_column = [measure + '_' + str(i) for i in range(start_idx, start_idx + rep_num)]
df_measure = measure_df_dict[measure]
df_measure['avg'] = df_measure.loc[:, measure_column].mean(axis=1)
-measure_df_dict[measure]['max'] = df_measure.loc[:, measure_column].max(axis=1)
+df_measure['max'] = df_measure.loc[:, measure_column].max(axis=1)
df_measure['min'] = df_measure.loc[:, measure_column].min(axis=1)
output_path = os.path.join(output_base_path, method + '_' + measure + '_record.csv')
df_measure.to_csv(output_path, sep=',', index=False)
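A compact sketch of the aggregation this hunk finishes: one AUC column per repetition, plus row-wise avg/max/min columns (the column names and values are made up):

```python
import pandas as pd

df_measure = pd.DataFrame({
    'date': ['2020_01', '2020_02'],
    'auc_0': [0.81, 0.83],  # repetition 0
    'auc_1': [0.79, 0.86],  # repetition 1
})
measure_column = ['auc_0', 'auc_1']
df_measure['avg'] = df_measure.loc[:, measure_column].mean(axis=1)
df_measure['max'] = df_measure.loc[:, measure_column].max(axis=1)
df_measure['min'] = df_measure.loc[:, measure_column].min(axis=1)
print(df_measure)
```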
