-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_extraction.py
103 lines (77 loc) · 4.06 KB
/
data_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import os.path
import numpy as np
import pandas as pd
from tqdm import tqdm
INPUT_DIR = "cleaned_data/multiple_records/"
OUTPUT_DIR = "new_extracted_data/"
os.makedirs(OUTPUT_DIR, exist_ok=True)
def load_interested_columns(file_path: str = "Unnecessary files.xlsx") -> pd.DataFrame:
df = pd.read_excel(file_path, sheet_name="Sheet1", index_col=None)
df = df[df["variable1"].notnull()]
df.drop(["Num of unique_ids"], axis=1, inplace=True)
return df
def extract_data(file_name: str, interested_column: str) -> None:
df = pd.read_csv(os.path.join(INPUT_DIR, file_name), index_col=None)
if interested_column in ["valoreMisura", "IcssIndiceCelluleSomatiche"]:
df = df[["idAnimale", interested_column]]
result_df = pd.DataFrame(columns=["idAnimale", "mean", "std", "count", "num_non_positive_records"])
unique_animal_ids = list(set(df["idAnimale"]))
for animal_id in unique_animal_ids:
temp_df = df[df["idAnimale"] == animal_id]
num_non_positive_records = len(temp_df[temp_df[interested_column] <= 0])
temp_df = temp_df[temp_df[interested_column] > 0]
data_entries = temp_df[interested_column]
mean = np.mean(data_entries)
std = np.std(data_entries)
temp_df = temp_df[(temp_df[interested_column] > mean - 3 * std) &
(temp_df[interested_column] < mean + 3 * std)]
data_entries = temp_df[interested_column]
mean = np.mean(data_entries)
std = np.std(data_entries)
count = len(data_entries)
result_df.loc[len(result_df)] = {
"idAnimale": animal_id,
"mean": mean,
"std": std,
"count": count,
"num_non_positive_records": num_non_positive_records
}
result_df.to_csv(os.path.join(OUTPUT_DIR, file_name), index=False)
# raise KeyboardInterrupt
elif file_name == "Pregnancy Diagnosis.csv":
df = df[["idAnimale", interested_column]]
result_df = pd.DataFrame(columns=["idAnimale", "num_positive_diagnoses"])
unique_animal_ids = list(set(df["idAnimale"]))
for animal_id in unique_animal_ids:
temp_df = df[df["idAnimale"] == animal_id]
positive_diagnoses = len(temp_df[temp_df[interested_column] == "POSITIVA"])
result_df.loc[len(result_df)] = {
"idAnimale": animal_id,
"num_positive_diagnoses": positive_diagnoses
}
result_df.to_csv(os.path.join(OUTPUT_DIR, file_name), index=False)
elif "," in interested_column:
interested_columns = ["idAnimale"] + interested_column.replace(" ", "").split(",")
df = df[interested_columns]
result_df = pd.DataFrame(columns=["idAnimale", "num_successful_pregnancy", "num_problematic_pregnancy"])
unique_animal_ids = list(set(df["idAnimale"]))
for animal_id in unique_animal_ids:
temp_df = df[df["idAnimale"] == animal_id]
temp_df = temp_df.fillna(0)
sum_male_calves_born_alive = sum(temp_df["NumeroMaschiNatiVivi"])
sum_female_calves_born_alive = sum(temp_df["NumeroFemmineNateVive"])
sum_male_calves_born_dead = sum(temp_df["NumeroMaschiNatiMorti"])
sum_female_calves_born_dead = sum(temp_df["NumeroFemmineNateMorte"])
result_df.loc[len(result_df)] = {
"idAnimale": animal_id,
"num_successful_pregnancy": int(sum_male_calves_born_alive + sum_female_calves_born_alive),
"num_problematic_pregnancy": int(sum_male_calves_born_dead + sum_female_calves_born_dead)
}
result_df.to_csv(os.path.join(OUTPUT_DIR, file_name), index=False)
if __name__ == "__main__":
file_mapping = load_interested_columns()
for _, row_data in tqdm(file_mapping.iterrows(),
total=file_mapping.shape[0],
desc="Extracting data... "
):
extract_data(row_data["File"], row_data["variable1"])