# data_cleaning3.R
library(dplyr)
# Locations of the cleaned input files and of the de-duplicated output.
# Every path ends with "/" so a file name can be appended with paste0().
input_path <- "C:/Users/holic/Box/AI Hackathon/AI_Hackathon/cleaned_data/"
output_path <- "C:/Users/holic/Box/AI Hackathon/AI_Hackathon/cleaned_data/single_record/"
report_path <- paste0(output_path, "0report/")

# Create the output directory first and the report directory (nested inside
# it) second; a directory that already exists is left untouched.
for (target_dir in c(output_path, report_path)) {
  if (!dir.exists(target_dir)) {
    dir.create(target_dir)
  }
}
# Keep only animals that occur exactly once in each input file ----
# For every file listed in `files`: read it from input_path, drop every row
# whose idAnimale occurs more than once in that file, write the remaining rows
# under the same file name in output_path, and add one row of counts to
# `report`. The report is then written to report_path.

# Empty report; one row per processed file is appended below.
report <- data.frame(
  File = character(),
  Original_row_number = integer(),
  Unique_row_number = integer(),
  Proportion_unique_id = numeric(),
  stringsAsFactors = FALSE
)

# Files handled with the "exactly one record per animal" rule.
files <- c(
  "Index Longevity.csv",
  "Index Persistence Lactation.csv",
  "Dual purpose Index.csv",
  "Culling.csv",
  "Slaughter.csv",
  "Unique Identification.csv"
)

# Report rows are collected in a pre-sized list and bound once after the
# loop instead of growing `report` with rbind() on every iteration.
report_rows <- vector("list", length(files))

for (i in seq_along(files)) {
  file_name <- files[[i]]

  # idAnimale is read as text. Ids can be 19 digits long (for example
  # -9223372036799170560); converted to a double they lose their low digits,
  # so different animals could compare equal and the ids written back out
  # would no longer match the input.
  records <- read.table(
    paste0(input_path, file_name),
    header = TRUE,
    sep = ",",
    colClasses = c(idAnimale = "character")
  )
  stopifnot("idAnimale" %in% names(records))
  original_row_number <- nrow(records)

  # Flag every row whose id appears more than once in the file. Combining the
  # forward and the backward scan marks the first occurrence as well as the
  # later ones, so animals with repeated ids are removed entirely.
  ids <- records[["idAnimale"]]
  is_repeated <- duplicated(ids) | duplicated(ids, fromLast = TRUE)
  data_unique <- records[!is_repeated, , drop = FALSE]

  unique_row_number <- nrow(data_unique)
  # NaN for an input file without rows (0 / 0).
  proportion_unique_id <- unique_row_number / original_row_number

  write.csv(
    data_unique,
    file = paste0(output_path, file_name),
    row.names = FALSE
  )
  cat("Processed file saved as:", paste0(output_path, file_name), "\n")

  report_rows[[i]] <- data.frame(
    File = file_name,
    Original_row_number = original_row_number,
    Unique_row_number = unique_row_number,
    Proportion_unique_id = round(proportion_unique_id, 4),
    stringsAsFactors = FALSE
  )
}

# do.call(rbind, list()) is NULL, so an empty `files` leaves `report` as is.
report <- rbind(report, do.call(rbind, report_rows))

# Save the report as a CSV file in the report directory
write.csv(
  report,
  file = paste0(report_path, "unique_id_report.csv"),
  row.names = FALSE,
  quote = FALSE
)
cat("Report saved as:", paste0(report_path, "unique_id_report.csv"), "\n")
#################################################################
#################################################################
# Birth.csv ----
# Animals can have multiple entries in Birth.csv, and removing every animal
# that has repeated entries (the rule applied to the files in `files`) is not
# acceptable here. One entry per animal is kept instead: the one with the
# earliest date. Rows whose date cannot be parsed sort last, so they are only
# kept when the animal has no dated entry.
# TODO: decide whether the first entry, the last entry, or some other rule is
# the right one to keep.

# idAnimale is read as text: ids can be 19 digits long (see the validation
# example at the end), which a double cannot represent exactly.
birth <- read.table(
  paste0(input_path, "Birth.csv"),
  header = TRUE,
  sep = ",",
  colClasses = c(idAnimale = "character")
) # 43,206 rows when this note was written

# Row count of Birth.csv itself. Without this assignment the value left over
# from the last file of the loop above would be used for the report row.
original_row_number <- nrow(birth)

# Build a Date from the year / month / day columns so rows can be ordered
# chronologically. Note that this helper column is also written to the output.
birth$date_formatted <- as.Date(
  paste(birth$anno, birth$mese, birth$giorno, sep = "-"),
  format = "%Y-%m-%d"
)

# After sorting by date, distinct() keeps the first row it meets for each id.
birth_unique <- birth %>%
  arrange(date_formatted) %>%
  distinct(idAnimale, .keep_all = TRUE)
# nrow(birth_unique) # 5,112

write.csv(
  birth_unique,
  file = paste0(output_path, "Birth.csv"),
  row.names = FALSE
)

unique_row_number <- nrow(birth_unique)
proportion_unique_id <- unique_row_number / original_row_number
report <- rbind(report, data.frame(
  File = "Birth.csv",
  Original_row_number = original_row_number,
  Unique_row_number = unique_row_number,
  Proportion_unique_id = round(proportion_unique_id, 4),
  stringsAsFactors = FALSE
))

# Rewrite the report so that it includes the Birth.csv row.
write.csv(
  report,
  file = paste0(report_path, "unique_id_report.csv"),
  row.names = FALSE,
  quote = FALSE
)

# Validation
# birth_unique[which(birth_unique$idAnimale == "-9223372036799170560"), ]