Skip to content

Commit

Permalink
Merge pull request #2 from inSilecoInc/CST-442-refactor
Browse files Browse the repository at this point in the history
CST-442 refactor ETL functions for all urgenceAviR datasets
  • Loading branch information
SteveViss authored Jan 19, 2025
2 parents 44941d2 + 1011b2d commit 7b20688
Show file tree
Hide file tree
Showing 26 changed files with 1,293 additions and 5 deletions.
20 changes: 15 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,16 +1,26 @@
Package: urgenceAviR
Title: Integrate Bird Abundance Data
Version: 0.0.1
Authors@R: c(
person("Clara", "Casabona", role = "ctb"),
person("Matthieu", "Beaumont", email = "[email protected]", role = c("aut", "cre", "cph")),
person("Steve", "Vissault", email = "[email protected]", role = "aut")
)
Description: A package designed to gather bird surveys and integrate them. The newly created database is used to assess environmental threat levels. It includes tools for loading,
    processing, and analyzing spatial datasets, as well as integration with
    various taxonomic reference systems.
Imports:
cli,
dplyr,
foreign,
glue,
sf,
janitor,
lubridate,
readxl,
stringr,
stringi,
tibble,
terra
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
Expand Down
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(get_ebird)
export(get_species_rank)
export(load_all_datasets)
export(load_biomq)
export(load_canards)
export(load_eider_hiver)
export(load_garrot)
export(load_macreuse)
export(load_oies)
export(load_sauvagine_fleuve)
export(load_somec)
97 changes: 97 additions & 0 deletions R/get_species_codes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#' Clean and Process Species Codes
#'
#' Reads species codes from a DBF file, keeps bird records only, joins them
#' with species metadata to recover a `Species_ID`, derives a `CODE_ID`
#' column, classifies each scientific name with `get_species_rank()`, and
#' optionally drops subspecies rows.
#'
#' @param species_path Path to the DBF file containing species codes.
#' @param metadata_path Path to the CSV file containing species metadata.
#' @param drop_subspecies Logical, whether to drop subspecies from the
#'   resulting data frame (default: TRUE).
#'
#' @return A cleaned data frame of species codes (column names lower-cased by
#'   `janitor::clean_names()`), with subspecies optionally filtered out.
#' @examples
#' \dontrun{
#' species_ref <- get_species_codes(
#'   species_path = "data/CodesEspeces.dbf",
#'   metadata_path = "data/metadata_species.csv",
#'   drop_subspecies = TRUE
#' )
#' }
get_species_codes <- function(species_path = external_files$species_codes$path, metadata_path = external_files$species_metadata$path, drop_subspecies = TRUE) {

  # Fail fast if either input file is missing
  if (!file.exists(species_path)) {
    cli::cli_abort("Could not find file { species_path }")
  }

  if (!file.exists(metadata_path)) {
    cli::cli_abort("Could not find file { metadata_path }")
  }

  # Load species codes
  species_code <- foreign::read.dbf(species_path, as.is = TRUE) |> tibble::as_tibble()

  # Filter species for birds and prepare columns
  species_code <- species_code |>
    dplyr::filter(Groupe_FR == "Oiseaux") |> # Only keep birds
    dplyr::select(Nom_FR, Nom_Scient, Name_EN, Code4_EN, Code4_FR, Alpha_Code, SousGroupe, STATUT_COS) |>
    dplyr::filter(!is.na(SousGroupe)) |>
    dplyr::distinct()

  # Read metadata and select columns
  metadata_sp <- read.csv(metadata_path, encoding = "UTF-8") |>
    dplyr::select(Name_SC, Species_ID)

  # Add species ID to species codes
  species_code <- dplyr::left_join(species_code, metadata_sp, by = c("Nom_Scient" = "Name_SC"))

  # Create reference table
  species_ref <- species_code |>
    dplyr::mutate(
      # Prefer the 4-letter English code; fall back to the metadata Species_ID
      CODE_ID = dplyr::case_when(
        !is.na(Code4_EN) ~ Code4_EN,
        !is.na(Species_ID) ~ Species_ID,
        TRUE ~ NA_character_ # Explicitly handle NA cases
      ),
      # Rank species. vapply() guarantees a character vector even on empty
      # input, where sapply() would silently return list() and break mutate()
      category = vapply(Nom_Scient, get_species_rank, character(1), USE.NAMES = FALSE),
      # Normalize French names: strip accents, harmonize "sp." spellings
      Nom_FR = stringr::str_replace(
        stringi::stri_trans_general(Nom_FR, "latin-ascii"),
        "non identifie|non identife|sp\\.e", "sp."
      ) |> tolower()
    ) |>
    dplyr::select(CODE_ID, Nom_Scient, STATUT_COS, Nom_FR, Code4_FR, Code4_EN, Alpha_Code, category) |>
    janitor::clean_names()

  # Optionally drop subspecies
  if (isTRUE(drop_subspecies)) {
    species_ref <- species_ref |> dplyr::filter(category != "subspecies")
  }

  cli::cli_alert_info("Load { nrow(species_ref) } species from species reference table")

  species_ref
}

#' @title Get species rank
#' @description Classify a scientific name as a genus-level record ("genre"),
#'   a subspecies (trinomial name), or a species (anything else).
#'
#' @param scientific_name The scientific name of the species to be ranked
#'
#' @return A character string indicating the rank: "genre", "subspecies", or "species".
#' @examples
#' get_species_rank("Aquila chrysaetos")
#'
#' @export
get_species_rank <- function(scientific_name) {
  # Names flagged with "sp." are only identified to genus level
  if (grepl("sp\\.", scientific_name)) {
    return("genre")
  }
  # A trinomial (three whitespace-separated parts) denotes a subspecies
  n_parts <- length(strsplit(scientific_name, "\\s+")[[1]])
  if (n_parts == 3) "subspecies" else "species"
}
54 changes: 54 additions & 0 deletions R/load_all_datasets.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#' Load and process all datasets
#'
#' Runs every `load_*` dataset function in turn. A loader that errors is
#' reported and skipped, so one broken source does not abort the others.
#'
#' @param combine Logical, if TRUE (default), combines all datasets into a single `data.frame`.
#' If FALSE, returns a named list of individual `data.frames`.
#' @return A combined `data.frame` or a list of `data.frames` containing all processed datasets.
#' @examples
#' \dontrun{
#' all_data <- load_all_datasets()
#' }
#' @export
load_all_datasets <- function(combine = TRUE) {

  # Map each output name to its loader function
  loaders <- list(
    canards_de_mer = load_canards,
    eider_hiver = load_eider_hiver,
    garrot = load_garrot,
    macreuse = load_macreuse,
    oies = load_oies,
    sauvagine_fleuve = load_sauvagine_fleuve,
    biomq = load_biomq
  )

  cli::cli_h1("Loading all datasets")

  datasets <- list()

  # Run every loader, keeping only the successful results
  for (name in names(loaders)) {
    tryCatch(
      datasets[[name]] <- loaders[[name]](),
      error = function(e) {
        cli::cli_alert_danger("Failed to load dataset: {name}. Error: {e$message}")
      }
    )
  }

  # List mode: hand back the individual data frames
  if (!combine) {
    cli::cli_alert_success("Returning datasets as a list")
    return(datasets)
  }

  # Default mode: stack everything, tagging rows with their dataset name
  cli::cli_h1("Combining datasets")
  combined_data <- dplyr::bind_rows(datasets, .id = "dataset")
  cli::cli_alert_success("Successfully combined datasets")
  combined_data
}
113 changes: 113 additions & 0 deletions R/load_biomq.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#' Load and process the BIOMQ dataset
#'
#' This function loads and processes the "BIOMQ" dataset from a predefined external Excel file.
#' It validates the file and columns, applies transformations, and integrates species codes using a reference table.
#'
#' @return A processed `data.frame` with standardized columns and integrated species codes.
#' @examples
#' \dontrun{
#' processed_biomq <- load_biomq()
#' }
#' @export
load_biomq <- function() {

  # h2 so loader headings nest under load_all_datasets()'s h1,
  # consistent with the other load_* functions
  cli::cli_h2("BIOMQ")
  cli::cli_alert_info("Starting integration procedure on {external_files$biomq$path}")

  # Assert file exists
  if (!file.exists(external_files$biomq$path)) {
    cli::cli_abort("Could not find file: {external_files$biomq$path}")
  }

  # Load data from the second sheet
  biomq <- readxl::read_excel(path = external_files$biomq$path, sheet = 2) |> tibble::as_tibble()

  # Assert columns exist
  missing_cols <- setdiff(external_files$biomq$check_columns, names(biomq))
  if (length(missing_cols) > 0) {
    cli::cli_abort(c(
      "Missing required columns in dataset:",
      paste(missing_cols, collapse = ", ")
    ))
  }

  cli::cli_alert_info("Applying transformation on {nrow(biomq)} rows")

  # Select and rename columns, then derive date/source/colony metadata
  biomq <- biomq |>
    dplyr::select(
      NomCol, CentroideX, CentroideY, NomFR,
      nb_nicheur, methode, nomRef, AnneeDebut,
      MoisDebut, JourDebut
    ) |>
    dplyr::rename(
      locality = NomCol,
      longitude = CentroideX,
      latitude = CentroideY,
      abondance = nb_nicheur,
      inv_type = methode,
      obs = nomRef,
      year = AnneeDebut,
      month = MoisDebut,
      day = JourDebut
    ) |>
    dplyr::mutate(
      date = lubridate::make_date(year, month, day),
      source = "BIOMQ",
      nom_fr = tolower(NomFR),
      link = external_files$biomq$path,
      colony = TRUE
    )

  # Enforce sp.: strip accents and map plural group names to "<group> sp."
  biomq <- biomq |>
    dplyr::mutate(
      nom_fr = stringr::str_replace_all(
        stringi::stri_trans_general(nom_fr, "latin-ascii") |> tolower(),
        c(
          "goelands" = "goeland sp.",
          "sternes" = "sterne sp.",
          "cormorans" = "cormoran sp."
        ))
    )

  # Join TAXO - Match CODE_ID using Nom_FR, then patch unresolved codes
  # via the manual `equivalences` lookup (applied once; a second identical
  # mutate in an earlier revision was a redundant no-op and was removed)
  biomq <- biomq |>
    dplyr::left_join(
      dplyr::select(get_species_codes(), code_id, nom_fr) |>
        dplyr::mutate(nom_fr = tolower(nom_fr)) |>
        dplyr::distinct(),
      by = "nom_fr",
      na_matches = "never"
    ) |>
    dplyr::mutate(
      code_id = ifelse(
        nom_fr %in% names(equivalences),
        equivalences[nom_fr],
        code_id
      )
    )

  # Drop non-relevant coordinates (rough bounding box for eastern North America)
  biomq <- biomq |>
    dplyr::filter(
      longitude >= -100 & longitude <= -30,
      latitude >= 30 & latitude <= 70
    )

  # Re-order cols
  biomq <- dplyr::select(biomq, dplyr::all_of(final_cols))

  cli::cli_alert_success("Returning {nrow(biomq)} rows")
  return(biomq)
}
81 changes: 81 additions & 0 deletions R/load_canards.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@

#' Load and process the Canards de Mer dataset
#'
#' This function loads and processes the "Canards de Mer" dataset from a predefined external source.
#' It standardizes column names, performs transformations, and integrates species codes using a reference table.
#'
#' @return A processed `data.frame` with standardized columns and integrated species codes.
#' @examples
#' \dontrun{
#' processed_canards <- load_canards()
#' }
#' @export
load_canards <- function() {

  cli::cli_h2("Canards de mer")
  cli::cli_alert_info("Starting integration procedure on { external_files$canards_de_mer$path }")

  # Fail fast if the input file is missing
  if (!file.exists(external_files$canards_de_mer$path)) {
    cli::cli_abort("Could not find file { external_files$canards_de_mer$path }")
  }

  # Read file (read.csv2: ';' separator, ',' decimal mark)
  canards <- read.csv2(external_files$canards_de_mer$path) |> tibble::as_tibble()

  # assert columns exist
  missing_cols <- setdiff(external_files$canards_de_mer$check_columns, names(canards))
  if (length(missing_cols) > 0) {
    cli::cli_abort(c(
      "Missing required columns in dataset:",
      paste(missing_cols, collapse = ", ")
    ))
  }

  cli::cli_alert_info("Applying transformation on { nrow(canards) } rows")

  canards <- canards |>
    dplyr::mutate(date_obs = lubridate::make_date(Annee, Mois, Jour)) |>
    dplyr::select(
      latitude = "LATITUDE",
      longitude = "LONGITUDE",
      locality = "NomLieu",
      date = "date_obs",
      abondance = "NombreTotal",
      obs = "NomObservateur",
      inv_type = "Signification.1",
      nom_fr = "Nom_FR"
    ) |>
    dplyr::mutate(
      latitude = as.numeric(latitude),
      longitude = as.numeric(longitude),
      # Normalize a standalone "sp" / "sp." token to "sp.".
      # The word boundaries are required: the previous pattern "sp" rewrote
      # every occurrence of the substring (e.g. "spatule" -> "sp.atule",
      # "sp." -> "sp..")
      nom_fr = stringi::stri_trans_general(gsub("\\bsp\\b\\.?", "sp.", nom_fr, ignore.case = TRUE), "latin-ascii") |>
        tolower(),
      source = "Canards de mer",
      colony = FALSE,
      link = external_files$canards_de_mer$path
    )

  # Join TAXO - Match code_id using nom_fr, then patch unresolved codes
  # via the manual `equivalences` lookup
  canards <- canards |>
    dplyr::left_join(
      dplyr::select(get_species_codes(), code_id, nom_fr) |> dplyr::distinct(),
      by = "nom_fr",
      na_matches = "never"
    ) |>
    dplyr::mutate(
      code_id = ifelse(
        nom_fr %in% names(equivalences),
        equivalences[nom_fr],
        code_id
      )
    )

  # Re-order cols
  canards <- dplyr::select(canards, dplyr::all_of(final_cols))

  cli::cli_alert_success("Returning { nrow(canards) } rows")

  return(canards)
}
Loading

0 comments on commit 7b20688

Please sign in to comment.