Commit

Preparing new release
Nikoletos-K committed Feb 25, 2025
1 parent 180813a commit 943647e
Showing 10 changed files with 220 additions and 110 deletions.
2 changes: 2 additions & 0 deletions docs/_toc.yml
@@ -14,6 +14,8 @@ parts:
title: Clean-Clean ER for Devs
- file: tutorials/CleanCleanERwithoutGT.ipynb
title: Clean-Clean ER without GT
- file: tutorials/Reproducibility.ipynb
title: Academic reproducibility
- file: tutorials/DirtyER.ipynb
title: Dirty ER for Devs
- file: tutorials/Optuna.ipynb
46 changes: 25 additions & 21 deletions docs/pyjedai/block_building.py
@@ -236,38 +236,42 @@ def stats(self, blocks: dict, verbose: bool = True) -> dict:
'skewness_of_comparison_per_entity': self.skewness_of_comparison_per_entity
}

def export_to_df(
self,
blocks: dict
) -> pd.DataFrame:
"""creates a dataframe for the evaluation report
def export_to_df(self, blocks: dict, tqdm_enable:bool = False) -> pd.DataFrame:
"""Creates a dataframe for the evaluation report.
Args:
blocks (any): Predicted blocks
data (Data): initial dataset
blocks (dict): Predicted blocks.
Returns:
pd.DataFrame: Dataframe predicted pairs (can be exported to csv)
pd.DataFrame: Dataframe with the predicted pairs (can be exported to CSV).
"""
pairs_df = pd.DataFrame(columns=['id1', 'id2'])
for _, block in blocks.items():
if self.data.is_dirty_er:
pairs_list = []

is_dirty_er = self.data.is_dirty_er
gt_to_ids_reversed_1 = self.data._gt_to_ids_reversed_1
gt_to_ids_reversed_2 = self.data._gt_to_ids_reversed_2

for block in tqdm(blocks.values(), desc="Exporting to DataFrame", disable=not tqdm_enable):
if is_dirty_er:
lblock = list(block.entities_D1)
for i1 in range(0, len(lblock)):
for i2 in range(i1+1, len(lblock)):
id1 = self.data._gt_to_ids_reversed_1[lblock[i1]]
id2 = self.data._gt_to_ids_reversed_1[lblock[i2]] if self.data.is_dirty_er \
else self.data._gt_to_ids_reversed_2[lblock[i2]]
pairs_df = pd.concat([pairs_df, pd.DataFrame([{'id1':id1, 'id2':id2}], index=[0])], ignore_index=True)

for i1 in range(len(lblock)):
for i2 in range(i1 + 1, len(lblock)):
id1 = gt_to_ids_reversed_1[lblock[i1]]
id2 = gt_to_ids_reversed_1[lblock[i2]]
pairs_list.append((id1, id2))
else:
for i1 in block.entities_D1:
for i2 in block.entities_D2:
id1 = self.data._gt_to_ids_reversed_1[i1]
id2 = self.data._gt_to_ids_reversed_1[i2] if self.data.is_dirty_er \
else self.data._gt_to_ids_reversed_2[i2]
pairs_df = pd.concat([pairs_df, pd.DataFrame([{'id1':id1, 'id2':id2}], index=[0])], ignore_index=True)
id1 = gt_to_ids_reversed_1[i1]
id2 = gt_to_ids_reversed_2[i2]
pairs_list.append((id1, id2))

pairs_df = pd.DataFrame(pairs_list, columns=['id1', 'id2'])

return pairs_df


class AbstractBlockBuilding(AbstractBlockProcessing):
"""Abstract class for the block building method
"""
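Note on the pattern above: the commit replaces the old per-pair pd.concat appends with a plain Python list that is converted to a DataFrame once, plus an optional tqdm progress bar. A rough sketch of the idea, with a hypothetical pair_iterable standing in for the block/cluster/edge loops:

import pandas as pd
from tqdm import tqdm

def pairs_to_df(pair_iterable, tqdm_enable: bool = False) -> pd.DataFrame:
    # Accumulate tuples first; repeated pd.concat would copy the whole frame on every append.
    pairs_list = []
    for id1, id2 in tqdm(pair_iterable, desc="Exporting to DataFrame", disable=not tqdm_enable):
        pairs_list.append((id1, id2))
    # A single constructor call at the end is linear in the number of pairs.
    return pd.DataFrame(pairs_list, columns=['id1', 'id2'])

The same structure is reused in clustering.py, joins.py and matching.py below.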
48 changes: 31 additions & 17 deletions docs/pyjedai/clustering.py
@@ -376,31 +376,45 @@ def stats(self) -> None:
def _configuration(self) -> dict:
pass

def export_to_df(self, prediction: list) -> pd.DataFrame:
"""creates a dataframe for the evaluation report
import pandas as pd

def export_to_df(self, prediction: list, tqdm_enable:bool = False) -> pd.DataFrame:
"""Creates a dataframe for the evaluation report.
Args:
prediction (any): Predicted clusters
prediction (list): Predicted clusters.
Returns:
pd.DataFrame: Dataframe containg evaluation scores and stats
pd.DataFrame: Dataframe containing evaluation scores and stats.
"""
pairs_df = pd.DataFrame(columns=['id1', 'id2'])
for cluster in prediction:
pairs_list = []

dataset_limit = self.data.dataset_limit
is_dirty_er = self.data.is_dirty_er
gt_to_ids_reversed_1 = self.data._gt_to_ids_reversed_1
gt_to_ids_reversed_2 = self.data._gt_to_ids_reversed_2

for cluster in tqdm(prediction, desc="Exporting to DataFrame", disable=not tqdm_enable):
lcluster = list(cluster)
for i1 in range(0, len(lcluster)):
for i2 in range(i1+1, len(lcluster)):
if lcluster[i1] < self.data.dataset_limit:
id1 = self.data._gt_to_ids_reversed_1[lcluster[i1]]
id2 = self.data._gt_to_ids_reversed_1[lcluster[i2]] if self.data.is_dirty_er else self.data._gt_to_ids_reversed_2[lcluster[i2]]

for i1 in range(len(lcluster)):
for i2 in range(i1 + 1, len(lcluster)):
node1 = lcluster[i1]
node2 = lcluster[i2]

if node1 < dataset_limit:
id1 = gt_to_ids_reversed_1[node1]
id2 = gt_to_ids_reversed_1[node2] if is_dirty_er else gt_to_ids_reversed_2[node2]
else:
id2 = self.data._gt_to_ids_reversed_2[lcluster[i1]]
id1 = self.data._gt_to_ids_reversed_1[lcluster[i2]]
pairs_df = pd.concat(
[pairs_df, pd.DataFrame([{'id1':id1, 'id2':id2}], index=[0])],
ignore_index=True
)
id2 = gt_to_ids_reversed_2[node1]
id1 = gt_to_ids_reversed_1[node2]

pairs_list.append((id1, id2))

pairs_df = pd.DataFrame(pairs_list, columns=['id1', 'id2'])

return pairs_df


def sorted_indicators(self, first_indicator : int, second_indicator : int):
return (first_indicator, second_indicator) if (first_indicator < second_indicator) else (second_indicator, first_indicator)
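The branch on dataset_limit above exists because, in clean-clean ER, cluster members are graph node ids: ids below data.dataset_limit come from the first dataset and the remaining ids from the second. A small sketch of that lookup, assuming the reverse-id mappings are plain dicts (as the attribute names suggest):

def node_to_original_id(node, dataset_limit, ids_reversed_1, ids_reversed_2):
    # Nodes in [0, dataset_limit) belong to D1; the rest belong to D2 (clean-clean ER).
    return ids_reversed_1[node] if node < dataset_limit else ids_reversed_2[node]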
6 changes: 3 additions & 3 deletions docs/pyjedai/datamodel.py
@@ -123,12 +123,12 @@ def __init__(
self.dataset_name_2 = dataset_name_2

# Fill NaN values with empty string
self.dataset_1 = self.dataset_1.astype(str)
self.dataset_1.fillna("", inplace=True)
self.dataset_1 = self.dataset_1.astype(str)
if not self.is_dirty_er:
self.dataset_2 = self.dataset_2.astype(str)
self.dataset_2.fillna("", inplace=True)

self.dataset_2 = self.dataset_2.astype(str)

# Attributes
if attributes_1 is None:
if dataset_1.columns.values.tolist():
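The reordering in datamodel.py is a genuine fix: calling astype(str) before fillna("") turns missing values into the literal string "nan", which the subsequent fillna no longer replaces. A quick illustration of the difference:

import numpy as np
import pandas as pd

df = pd.DataFrame({"name": ["alice", np.nan]})

old_order = df.astype(str).fillna("")   # NaN has already become the string "nan"
new_order = df.fillna("").astype(str)   # NaN is replaced by "" before the cast

print(old_order["name"].tolist())  # ['alice', 'nan']
print(new_order["name"].tolist())  # ['alice', '']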
41 changes: 28 additions & 13 deletions docs/pyjedai/joins.py
@@ -309,24 +309,40 @@ def _configuration(self) -> dict:
"qgrams": self.qgrams
}

def export_to_df(self, prediction) -> pd.DataFrame:
def export_to_df(self, prediction, tqdm_enable=False) -> pd.DataFrame:
"""creates a dataframe with the predicted pairs
Args:
prediction (any): Predicted candidate pairs
prediction (any): Predicted candidate pairs.
tqdm_enable (bool, optional): Enable tqdm. Defaults to False.
Returns:
pd.DataFrame: Dataframe with the predicted pairs
"""
if self.data.ground_truth is None:
raise AttributeError("Cannot proceed to evaluation without a ground-truth file. \
Data object must have been initialized with the ground-truth file")
pairs_df = pd.DataFrame(columns=['id1', 'id2'])
for edge in prediction.edges:
id1 = self.data._gt_to_ids_reversed_1[edge[0]]
id2 = self.data._gt_to_ids_reversed_1[edge[1]] if self.data.is_dirty_er \
else self.data._gt_to_ids_reversed_2[edge[1]]
pairs_df = pd.concat([pairs_df, pd.DataFrame([{'id1':id1, 'id2':id2}], index=[0])], ignore_index=True)
pairs_list = []

is_dirty_er = self.data.is_dirty_er
dataset_limit = self.data.dataset_limit
gt_to_ids_reversed_1 = self.data._gt_to_ids_reversed_1
gt_to_ids_reversed_2 = self.data._gt_to_ids_reversed_2

for edge in tqdm(prediction.edges, disable=not tqdm_enable, desc="Exporting to DataFrame"):
node1, node2 = edge

if not is_dirty_er:
if node1 < dataset_limit:
id1 = gt_to_ids_reversed_1[node1]
id2 = gt_to_ids_reversed_2[node2]
else:
id1 = gt_to_ids_reversed_2[node1]
id2 = gt_to_ids_reversed_1[node2]
else:
id1 = gt_to_ids_reversed_1[node1]
id2 = gt_to_ids_reversed_1[node2]

pairs_list.append((id1, id2))

pairs_df = pd.DataFrame(pairs_list, columns=['id1', 'id2'])

return pairs_df

@@ -416,8 +432,7 @@ def _configuration(self) -> dict:
"tokenization" : self.tokenization,
"qgrams": self.qgrams
}



class PETopKJoin(TopKJoin):
"""Progressive Entity Resolution Top-K class of Joins module
"""
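From the caller's side, the new tqdm_enable flag only toggles the progress bar; a hypothetical usage (the join instance and prediction graph are placeholders for whatever the surrounding workflow produced, not calls confirmed by this commit):

# 'prediction' is the candidate-pair graph produced earlier in the joins workflow.
pairs_df = join.export_to_df(prediction, tqdm_enable=True)   # show a progress bar
pairs_df.to_csv("candidate_pairs.csv", index=False)          # persist the exported pairs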
76 changes: 42 additions & 34 deletions docs/pyjedai/matching.py
@@ -155,7 +155,7 @@ def get_weights_median(self) -> float:
def get_weights_standard_deviation(self) -> float:
return statistics.stdev([w for _, _, w in self.pairs.edges(data='weight')])

def plot_distribution_of_all_weights(self) -> None:
def plot_distribution_of_all_weights(self, save_figure_path=None) -> None:
title = "Distribution of scores with " + self.metric + " metric in graph from entity matching"
plt.figure(figsize=(10, 6))
all_weights = [w for _, _, w in self.pairs.edges(data='weight')]
@@ -168,9 +168,11 @@ def plot_distribution_of_all_weights(self) -> None:
plt.axvline(x = self.get_weights_median(), color = 'black', label = 'Median weight')
plt.axvline(x = self.get_weights_avg()+self.get_weights_standard_deviation(), color = 'green', label = 'Average + SD weight')
plt.legend()
if save_figure_path:
plt.savefig(save_figure_path)
plt.show()

def plot_distribution_of_all_weights_2d(self) -> None:
def plot_distribution_of_all_weights_2d(self, save_figure_path=None) -> None:
title = "Distribution of scores with " + self.metric + " metric in graph from entity matching"
plt.figure(figsize=(10, 6))
all_weights = [w for _, _, w in self.pairs.edges(data='weight')]
@@ -182,9 +184,11 @@ def plot_distribution_of_all_weights_2d(self) -> None:
plt.axvline(x = self.get_weights_median(), color = 'black', label = 'Median weight')
plt.axvline(x = self.get_weights_avg()+self.get_weights_standard_deviation(), color = 'green', label = 'Average + SD weight')
plt.legend()
if save_figure_path:
plt.savefig(save_figure_path)
plt.show()

def plot_distribution_of_scores(self) -> None:
def plot_distribution_of_scores(self, save_figure_path=None) -> None:
title = "Distribution of scores with " + self.metric + " metric in graph from entity matching"
def weight_distribution(G):
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
@@ -221,9 +225,11 @@ def weight_distribution(G):
plt.axvline(x = self.get_weights_median()*10, color = 'black', label = 'Median weight')
plt.axvline(x = self.get_weights_avg()*10+self.get_weights_standard_deviation()*10, color = 'green', label = 'Average + SD weight')
plt.legend()
if save_figure_path:
plt.savefig(save_figure_path)
plt.show()

def plot_gt_distribution_of_scores(self) -> None:
def plot_gt_distribution_of_scores(self, save_figure_path=None) -> None:
title = "Distribution of scores with " + self.metric + " metric on ground truth pairs"
def weight_distribution():
bins = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
@@ -257,6 +263,8 @@ def weight_distribution():
ax.set_title(title)
ax.set_xlabel('Similarity score range')
fig.tight_layout()
if save_figure_path:
plt.savefig(save_figure_path)
plt.show()

def evaluate(self,
@@ -294,43 +302,43 @@ def evaluate(self,

def stats(self) -> None:
pass

def export_pairs_to_csv(self, filename: str, with_similarity: bool = True) -> None:
if self.pairs is None:
raise AttributeError("Pairs have not been initialized yet. " +
"Please run the method `run` first.")

with open(filename, 'w') as f:
for e1, e2, similarity in self.pairs.edges(data='weight'):
e1 = self.data._ids_mapping_1[e1] if e1 < self.data.dataset_limit else self.data._ids_mapping_2[e1]
e2 = self.data._ids_mapping_1[e2] if e2 < self.data.dataset_limit else self.data._ids_mapping_2[e2]
if with_similarity:
f.write(f"{e1}, {e2}, {similarity}\n")
else:
f.write(f"{e1}, {e2}\n")
f.close()

def export_to_df(self, prediction: Graph) -> pd.DataFrame:
"""creates a dataframe with the predicted pairs

def export_to_df(self, prediction: Graph, tqdm_enable=False) -> pd.DataFrame:
"""Creates a dataframe with the predicted pairs.
Args:
prediction (any): Predicted graph
prediction (Graph): Predicted graph
tqdm_enable (bool): Whether to enable tqdm progress bar
Returns:
pd.DataFrame: Dataframe with the predicted pairs
"""
if self.data.ground_truth is None:
raise AttributeError("Cannot proceed to evaluation without a ground-truth file. \
Data object must have been initialized with the ground-truth file")
pairs_df = pd.DataFrame(columns=['id1', 'id2'])
for edge in prediction.edges:
id1 = self.data._gt_to_ids_reversed_1[edge[0]]
id2 = self.data._gt_to_ids_reversed_1[edge[1]] if self.data.is_dirty_er \
else self.data._gt_to_ids_reversed_2[edge[1]]
pairs_df = pd.concat([pairs_df, pd.DataFrame([{'id1':id1, 'id2':id2}], index=[0])], ignore_index=True)
pairs_list = []

return pairs_df
is_dirty_er = self.data.is_dirty_er
dataset_limit = self.data.dataset_limit
gt_to_ids_reversed_1 = self.data._gt_to_ids_reversed_1
gt_to_ids_reversed_2 = self.data._gt_to_ids_reversed_2

for edge in tqdm(prediction.edges, disable=not tqdm_enable, desc="Exporting to DataFrame"):
node1, node2 = edge

if not is_dirty_er:
if node1 < dataset_limit:
id1 = gt_to_ids_reversed_1[node1]
id2 = gt_to_ids_reversed_2[node2]
else:
id1 = gt_to_ids_reversed_2[node1]
id2 = gt_to_ids_reversed_1[node2]
else:
id1 = gt_to_ids_reversed_1[node1]
id2 = gt_to_ids_reversed_1[node2]

pairs_list.append((id1, id2))

pairs_df = pd.DataFrame(pairs_list, columns=['id1', 'id2'])

return pairs_df

class EntityMatching(AbstractEntityMatching):
"""Calculates similarity from 0.0 to 1.0 for all blocks
@@ -345,7 +353,7 @@ def __init__(
tokenizer: str = 'white_space_tokenizer',
vectorizer : str = None,
qgram : int = 1,
similarity_threshold: float = 0.5,
similarity_threshold: float = 0.0,
tokenizer_return_unique_values = False, # unique values or not,
attributes: any = None,
) -> None:
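The plotting helpers in matching.py now take an optional save_figure_path, so figures can be written to disk as well as shown; the em instance name below is illustrative:

em.plot_distribution_of_all_weights(save_figure_path="weights_distribution.png")
em.plot_gt_distribution_of_scores(save_figure_path="gt_scores.png")

Note also that EntityMatching's default similarity_threshold drops from 0.5 to 0.0, which presumably keeps all scored pairs unless a caller sets a threshold explicitly.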
