-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #2 from inSilecoInc/CST-442-refactor
CST-442 refactor ETL functions for all urgenceAviR datasets
- Loading branch information
Showing
26 changed files
with
1,293 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,26 @@ | ||
Package: urgenceAviR | ||
Title: What the Package Does (One Line, Title Case) | ||
Version: 0.0.0.9000 | ||
Title: Integrate birds abundances data | ||
Version: 0.0.1 | ||
Authors@R: c( | ||
person("Clara", "Casabona", role = "ctb"), | ||
person("Matthieu", "Beaumont", email = "[email protected]", role = c("aut", "cre", "cph")), | ||
person("Steve", "Vissault", email = "[email protected]", role = "aut") | ||
) | ||
Description: What the package does (one paragraph). | ||
Description: A package designed to gather bird surveys and integrate them. The newly created database is used to assess environmental threat levels. It includes tools for loading, | ||
processing, and analyzing spatial datasets, as well as integration with | ||
various taxonomic reference systems. | ||
Imports: | ||
cli, | ||
dplyr, | ||
foreign, | ||
glue, | ||
terra, | ||
sf | ||
janitor, | ||
lubridate, | ||
readxl, | ||
stringr, | ||
stringi, | ||
tibble, | ||
terra | ||
License: MIT + file LICENSE | ||
Encoding: UTF-8 | ||
Roxygen: list(markdown = TRUE) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,13 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(get_ebird) | ||
export(get_species_rank) | ||
export(load_all_datasets) | ||
export(load_biomq) | ||
export(load_canards) | ||
export(load_eider_hiver) | ||
export(load_garrot) | ||
export(load_macreuse) | ||
export(load_oies) | ||
export(load_sauvagine_fleuve) | ||
export(load_somec) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
#' Clean and Process Species Codes | ||
#' | ||
#' This file contains functions to clean and process species codes from a DBF file, | ||
#' join them with metadata, and optionally drop subspecies from the output. | ||
#' | ||
#' @title Clean and Process Species Codes | ||
#' @description Functions to clean and process species codes from a DBF file, join them with metadata, and optionally drop subspecies. | ||
#' | ||
#' @param species_path Path to the DBF file containing species codes. | ||
#' @param metadata_path Path to the CSV file containing metadata. | ||
#' @param drop_subspecies Logical, whether to drop subspecies from the resulting data frame (default: TRUE). | ||
#' | ||
#' @return A cleaned data frame of species codes with optional subspecies filtered out. | ||
#' @examples | ||
#' \dontrun{ | ||
#' species_ref <- get_species_codes( | ||
#' species_path = "data/CodesEspeces.dbf", | ||
#' metadata_path = "data/metadata_species.csv", | ||
#' drop_subspecies = TRUE | ||
#' ) | ||
#' } | ||
get_species_codes <- function(species_path = external_files$species_codes$path, metadata_path = external_files$species_metadata$path, drop_subspecies = TRUE) { | ||
|
||
# assert file exists | ||
if (!file.exists(species_path)) { | ||
cli::cli_abort("Could not find file { species_path }") | ||
} | ||
|
||
if(!file.exists(metadata_path)) { | ||
cli::cli_abort("Could not find file { metadata_path }") | ||
} | ||
|
||
# Load species codes | ||
species_code <- foreign::read.dbf(species_path, as.is = TRUE) |> tibble::as_tibble() | ||
|
||
# Filter species for birds and prepare columns | ||
species_code <- species_code |> | ||
dplyr::filter(Groupe_FR == "Oiseaux") |> # Only keep birds | ||
dplyr::select(Nom_FR, Nom_Scient, Name_EN, Code4_EN, Code4_FR, Alpha_Code, SousGroupe, STATUT_COS) |> | ||
dplyr::filter(!is.na(SousGroupe)) |> | ||
dplyr::distinct() | ||
|
||
# Read metadata and select columns | ||
metadata_sp <- read.csv(metadata_path, encoding = "UTF-8") |> | ||
dplyr::select(Name_SC, Species_ID) | ||
|
||
# Add species ID to species codes | ||
species_code <- dplyr::left_join(species_code, metadata_sp, by = c("Nom_Scient" = "Name_SC")) | ||
|
||
# Create reference table | ||
species_ref <- species_code |> | ||
dplyr::mutate( | ||
# Create CODE_ID column | ||
CODE_ID = dplyr::case_when( | ||
!is.na(Code4_EN) ~ Code4_EN, | ||
!is.na(Species_ID) ~ Species_ID, | ||
TRUE ~ NA_character_ # Explicitly handle NA cases | ||
), | ||
# Rank species | ||
category = sapply(Nom_Scient, get_species_rank), | ||
Nom_FR = stringr::str_replace( | ||
stringi::stri_trans_general(Nom_FR, "latin-ascii"), | ||
"non identifie|non identife|sp\\.e", "sp." | ||
) |> tolower() | ||
) |> | ||
dplyr::select(CODE_ID, Nom_Scient, STATUT_COS, Nom_FR, Code4_FR, Code4_EN, Alpha_Code, category) |> | ||
janitor::clean_names() | ||
|
||
# Optionally drop subspecies | ||
if (isTRUE(drop_subspecies)) { | ||
species_ref <- species_ref |> dplyr::filter(category != "subspecies") | ||
} | ||
|
||
cli::cli_alert_info("Load { nrow(species_ref) } species from species reference table") | ||
|
||
return(species_ref) | ||
} | ||
|
||
#' @title Get species rank | ||
#' @description Get species rank based on scientific name | ||
#' | ||
#' @param scientific_name The scientific name of the species to be ranked | ||
#' | ||
#' @return A character string indicating the rank: "genre", "subspecies", or "species". | ||
#' @examples | ||
#' get_species_rank("Aquila chrysaetos") | ||
#' | ||
#' @export | ||
get_species_rank <- function(scientific_name) { | ||
if (grepl("sp\\.", scientific_name)) { | ||
return("genre") | ||
} else if (length(strsplit(scientific_name, "\\s+")[[1]]) == 3) { | ||
return("subspecies") | ||
} else { | ||
return("species") | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
#' Load and process all datasets | ||
#' | ||
#' This function sequentially calls all the `load_*` dataset processing functions | ||
#' and combines their outputs into a single `data.frame` or a list of `data.frames`. | ||
#' | ||
#' @param combine Logical, if TRUE (default), combines all datasets into a single `data.frame`. | ||
#' If FALSE, returns a named list of individual `data.frames`. | ||
#' @return A combined `data.frame` or a list of `data.frames` containing all processed datasets. | ||
#' @examples | ||
#' \dontrun{ | ||
#' all_data <- load_all_datasets() | ||
#' } | ||
#' @export | ||
load_all_datasets <- function(combine = TRUE) { | ||
|
||
# List of dataset functions | ||
dataset_functions <- list( | ||
canards_de_mer = load_canards, | ||
eider_hiver = load_eider_hiver, | ||
garrot = load_garrot, | ||
macreuse = load_macreuse, | ||
oies = load_oies, | ||
sauvagine_fleuve = load_sauvagine_fleuve, | ||
biomq = load_biomq | ||
) | ||
|
||
cli::cli_h1("Loading all datasets") | ||
|
||
# Initialize an empty list to store results | ||
datasets <- list() | ||
|
||
# Call each function and store its result | ||
for (name in names(dataset_functions)) { | ||
tryCatch( | ||
{ | ||
datasets[[name]] <- dataset_functions[[name]]() | ||
}, | ||
error = function(e) { | ||
cli::cli_alert_danger("Failed to load dataset: {name}. Error: {e$message}") | ||
} | ||
) | ||
} | ||
|
||
# Combine datasets if requested | ||
if (combine) { | ||
cli::cli_h1("Combining datasets") | ||
combined_data <- dplyr::bind_rows(datasets, .id = "dataset") | ||
cli::cli_alert_success("Successfully combined datasets") | ||
return(combined_data) | ||
} | ||
|
||
cli::cli_alert_success("Returning datasets as a list") | ||
return(datasets) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
#' Load and process the BIOMQ dataset | ||
#' | ||
#' This function loads and processes the "BIOMQ" dataset from a predefined external Excel file. | ||
#' It validates the file and columns, applies transformations, and integrates species codes using a reference table. | ||
#' | ||
#' @return A processed `data.frame` with standardized columns and integrated species codes. | ||
#' @examples | ||
#' \dontrun{ | ||
#' processed_biomq <- load_biomq() | ||
#' } | ||
#' @export | ||
load_biomq <- function() { | ||
|
||
cli::cli_h1("BIOMQ") | ||
cli::cli_alert_info("Starting integration procedure on {external_files$biomq$path}") | ||
|
||
# Assert file exists | ||
if (!file.exists(external_files$biomq$path)) { | ||
cli::cli_abort("Could not find file: {external_files$biomq$path}") | ||
} | ||
|
||
# Load data from the second sheet | ||
biomq <- readxl::read_excel(path = external_files$biomq$path, sheet = 2) |> tibble::as_tibble() | ||
|
||
# Assert columns exist | ||
missing_cols <- setdiff(external_files$biomq$check_columns, names(biomq)) | ||
if (length(missing_cols) > 0) { | ||
cli::cli_abort(c( | ||
"Missing required columns in dataset:", | ||
paste(missing_cols, collapse = ", ") | ||
)) | ||
} | ||
|
||
cli::cli_alert_info("Applying transformation on {nrow(biomq)} rows") | ||
|
||
# Select and rename columns | ||
biomq <- biomq |> | ||
dplyr::select( | ||
NomCol, CentroideX, CentroideY, NomFR, | ||
nb_nicheur, methode, nomRef, AnneeDebut, | ||
MoisDebut, JourDebut | ||
) |> | ||
dplyr::rename( | ||
locality = NomCol, | ||
longitude = CentroideX, | ||
latitude = CentroideY, | ||
abondance = nb_nicheur, | ||
inv_type = methode, | ||
obs = nomRef, | ||
year = AnneeDebut, | ||
month = MoisDebut, | ||
day = JourDebut | ||
) |> | ||
dplyr::mutate( | ||
date = lubridate::make_date(year, month, day), | ||
source = "BIOMQ", | ||
nom_fr = tolower(NomFR), | ||
link = external_files$biomq$path, | ||
colony = TRUE | ||
) | ||
|
||
# Enforce sp. | ||
biomq <- biomq |> | ||
dplyr::mutate( | ||
nom_fr = stringr::str_replace_all( | ||
stringi::stri_trans_general(nom_fr, "latin-ascii") |> tolower(), | ||
c( | ||
"goelands" = "goeland sp.", | ||
"sternes" = "sterne sp.", | ||
"cormorans" = "cormoran sp." | ||
)) | ||
) | ||
|
||
# Join TAXO - Match CODE_ID using Nom_FR | ||
biomq <- biomq |> | ||
dplyr::left_join( | ||
dplyr::select(get_species_codes(), code_id, nom_fr) |> | ||
dplyr::mutate(nom_fr = tolower(nom_fr)) |> | ||
dplyr::distinct(), | ||
by = "nom_fr", | ||
na_matches = "never" | ||
) |> | ||
dplyr::mutate( | ||
code_id = ifelse( | ||
nom_fr %in% names(equivalences), | ||
equivalences[nom_fr], | ||
code_id | ||
) | ||
) | ||
|
||
# Drop non-relevant coordinates | ||
biomq <- biomq |> | ||
dplyr::filter( | ||
longitude >= -100 & longitude <= -30, | ||
latitude >= 30 & latitude <= 70 | ||
) | ||
|
||
# Adjust CODE_ID using equivalences_minuscule | ||
biomq <- biomq |> | ||
dplyr::mutate( | ||
code_id = ifelse( | ||
nom_fr %in% names(equivalences), | ||
equivalences[nom_fr], | ||
code_id | ||
) | ||
) | ||
|
||
# Re-order cols | ||
biomq <- dplyr::select(biomq, dplyr::all_of(final_cols)) | ||
|
||
cli::cli_alert_success("Returning {nrow(biomq)} rows") | ||
return(biomq) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
|
||
#' Load and process the Canards de Mer canards | ||
#' | ||
#' This function loads and processes the "Canards de Mer" canards from a predefined external source. | ||
#' It standardizes column names, performs transformations, and integrates species codes using a reference table. | ||
#' | ||
#' @return A processed `data.frame` with standardized columns and integrated species codes. | ||
#' @examples | ||
#' \dontrun{ | ||
#' processed_canards <- load_canards() | ||
#' } | ||
#' @export | ||
load_canards <- function() { | ||
|
||
cli::cli_h2("Canards de mer") | ||
cli::cli_alert_info("Starting integration procedure on { external_files$canards_de_mer$path }") | ||
|
||
# assert file exists | ||
if(!file.exists(external_files$canards_de_mer$path)) { | ||
cli::cli_abort("Could not find file { external_files$canards_de_mer$path }") | ||
} | ||
|
||
# Read file | ||
canards <- read.csv2(external_files$canards_de_mer$path) |> tibble::as_tibble() | ||
|
||
# assert columns exist | ||
missing_cols <- setdiff(external_files$canards_de_mer$check_columns, names(canards)) | ||
if (length(missing_cols) > 0) { | ||
cli::cli_abort(c( | ||
"Missing required columns in dataset:", | ||
paste(missing_cols, collapse = ", ") | ||
)) | ||
} | ||
|
||
cli::cli_alert_info("Applying transformation on { nrow(canards) } rows") | ||
|
||
canards <- canards |> | ||
dplyr::mutate(date_obs = lubridate::make_date(Annee, Mois, Jour)) |> | ||
dplyr::select( | ||
latitude = "LATITUDE", | ||
longitude = "LONGITUDE", | ||
locality = "NomLieu", | ||
date = "date_obs", | ||
abondance = "NombreTotal", | ||
obs = "NomObservateur", | ||
inv_type = "Signification.1", | ||
nom_fr = "Nom_FR" | ||
) |> | ||
dplyr::mutate( | ||
latitude = as.numeric(latitude), | ||
longitude = as.numeric(longitude), | ||
nom_fr = stringi::stri_trans_general(gsub("sp", "sp\\.", nom_fr, ignore.case = TRUE), "latin-ascii") |> | ||
tolower(), | ||
source = "Canards de mer", | ||
colony = FALSE, | ||
link = external_files$canards_de_mer$path | ||
) | ||
|
||
# Join TAXO - Match code_id using nom_fr | ||
canards <- canards |> | ||
dplyr::left_join( | ||
dplyr::select(get_species_codes(), code_id, nom_fr) |> dplyr::distinct(), | ||
by = "nom_fr", | ||
na_matches = "never" | ||
) |> | ||
dplyr::mutate( | ||
code_id = ifelse( | ||
nom_fr %in% names(equivalences), | ||
equivalences[nom_fr], | ||
code_id | ||
) | ||
) | ||
|
||
# Re-order cols | ||
canards <- dplyr::select(canards, dplyr::all_of(final_cols)) | ||
|
||
cli::cli_alert_success("Returning { nrow(canards) } rows") | ||
|
||
return(canards) | ||
|
||
} |
Oops, something went wrong.