Skip to content

Commit

Permalink
Merge pull request #2 from inSilecoInc/CST-442-refactor
Browse files Browse the repository at this point in the history
CST-442 refactor ETL functions for all urgenceAviR datasets
  • Loading branch information
SteveViss authored Jan 19, 2025
2 parents 44941d2 + 1011b2d commit 7b20688
Show file tree
Hide file tree
Showing 26 changed files with 1,293 additions and 5 deletions.
20 changes: 15 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,16 +1,26 @@
Package: urgenceAviR
Title: Integrate Bird Abundance Data
Version: 0.0.1
Authors@R: c(
person("Clara", "Casabona", role = "ctb"),
person("Matthieu", "Beaumont", email = "[email protected]", role = c("aut", "cre", "cph")),
person("Steve", "Vissault", email = "[email protected]", role = "aut")
)
Description: A package designed to gather bird surveys and integrate them. The newly created database is used to assess environmental threat levels. It includes tools for loading,
    processing, and analyzing spatial datasets, as well as integration with
    various taxonomic reference systems.
Imports:
cli,
dplyr,
foreign,
glue,
sf,
janitor,
lubridate,
readxl,
stringr,
stringi,
tibble,
terra
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
Expand Down
10 changes: 10 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
# Generated by roxygen2: do not edit by hand

export(get_ebird)
export(get_species_rank)
export(load_all_datasets)
export(load_biomq)
export(load_canards)
export(load_eider_hiver)
export(load_garrot)
export(load_macreuse)
export(load_oies)
export(load_sauvagine_fleuve)
export(load_somec)
97 changes: 97 additions & 0 deletions R/get_species_codes.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#' Clean and Process Species Codes
#'
#' Reads species codes from a DBF file, keeps bird records only, joins them
#' with species metadata to recover a `Species_ID`, derives a `CODE_ID`
#' column, classifies each scientific name with `get_species_rank()`, and
#' optionally drops subspecies rows.
#'
#' @param species_path Path to the DBF file containing species codes.
#' @param metadata_path Path to the CSV file containing species metadata.
#' @param drop_subspecies Logical, whether to drop subspecies from the
#'   resulting data frame (default: TRUE).
#'
#' @return A cleaned data frame of species codes (column names lower-cased by
#'   `janitor::clean_names()`), with subspecies optionally filtered out.
#' @examples
#' \dontrun{
#' species_ref <- get_species_codes(
#'   species_path = "data/CodesEspeces.dbf",
#'   metadata_path = "data/metadata_species.csv",
#'   drop_subspecies = TRUE
#' )
#' }
get_species_codes <- function(species_path = external_files$species_codes$path, metadata_path = external_files$species_metadata$path, drop_subspecies = TRUE) {

  # Fail fast if either input file is missing
  if (!file.exists(species_path)) {
    cli::cli_abort("Could not find file { species_path }")
  }

  if (!file.exists(metadata_path)) {
    cli::cli_abort("Could not find file { metadata_path }")
  }

  # Load species codes
  species_code <- foreign::read.dbf(species_path, as.is = TRUE) |> tibble::as_tibble()

  # Filter species for birds and prepare columns
  species_code <- species_code |>
    dplyr::filter(Groupe_FR == "Oiseaux") |> # Only keep birds
    dplyr::select(Nom_FR, Nom_Scient, Name_EN, Code4_EN, Code4_FR, Alpha_Code, SousGroupe, STATUT_COS) |>
    dplyr::filter(!is.na(SousGroupe)) |>
    dplyr::distinct()

  # Read metadata and select columns
  metadata_sp <- read.csv(metadata_path, encoding = "UTF-8") |>
    dplyr::select(Name_SC, Species_ID)

  # Add species ID to species codes
  species_code <- dplyr::left_join(species_code, metadata_sp, by = c("Nom_Scient" = "Name_SC"))

  # Create reference table
  species_ref <- species_code |>
    dplyr::mutate(
      # Prefer the 4-letter English code; fall back to the metadata Species_ID
      CODE_ID = dplyr::case_when(
        !is.na(Code4_EN) ~ Code4_EN,
        !is.na(Species_ID) ~ Species_ID,
        TRUE ~ NA_character_ # Explicitly handle NA cases
      ),
      # Rank species. vapply() guarantees a character vector even on empty
      # input, where sapply() would silently return list() and break mutate()
      category = vapply(Nom_Scient, get_species_rank, character(1), USE.NAMES = FALSE),
      # Normalize French names: strip accents, harmonize "sp." spellings
      Nom_FR = stringr::str_replace(
        stringi::stri_trans_general(Nom_FR, "latin-ascii"),
        "non identifie|non identife|sp\\.e", "sp."
      ) |> tolower()
    ) |>
    dplyr::select(CODE_ID, Nom_Scient, STATUT_COS, Nom_FR, Code4_FR, Code4_EN, Alpha_Code, category) |>
    janitor::clean_names()

  # Optionally drop subspecies
  if (isTRUE(drop_subspecies)) {
    species_ref <- species_ref |> dplyr::filter(category != "subspecies")
  }

  cli::cli_alert_info("Load { nrow(species_ref) } species from species reference table")

  species_ref
}

#' @title Get species rank
#' @description Classify a scientific name as a genus-level record ("genre"),
#'   a subspecies (trinomial name), or a species (anything else).
#'
#' @param scientific_name The scientific name of the species to be ranked
#'
#' @return A character string indicating the rank: "genre", "subspecies", or "species".
#' @examples
#' get_species_rank("Aquila chrysaetos")
#'
#' @export
get_species_rank <- function(scientific_name) {
  # Names flagged with "sp." are only identified to genus level
  if (grepl("sp\\.", scientific_name)) {
    return("genre")
  }
  # A trinomial (three whitespace-separated parts) denotes a subspecies
  n_parts <- length(strsplit(scientific_name, "\\s+")[[1]])
  if (n_parts == 3) "subspecies" else "species"
}
54 changes: 54 additions & 0 deletions R/load_all_datasets.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#' Load and process all datasets
#'
#' Runs every `load_*` dataset function in turn. A loader that errors is
#' reported and skipped, so one broken source does not abort the others.
#'
#' @param combine Logical, if TRUE (default), combines all datasets into a single `data.frame`.
#' If FALSE, returns a named list of individual `data.frames`.
#' @return A combined `data.frame` or a list of `data.frames` containing all processed datasets.
#' @examples
#' \dontrun{
#' all_data <- load_all_datasets()
#' }
#' @export
load_all_datasets <- function(combine = TRUE) {

  # Map each output name to its loader function
  loaders <- list(
    canards_de_mer = load_canards,
    eider_hiver = load_eider_hiver,
    garrot = load_garrot,
    macreuse = load_macreuse,
    oies = load_oies,
    sauvagine_fleuve = load_sauvagine_fleuve,
    biomq = load_biomq
  )

  cli::cli_h1("Loading all datasets")

  datasets <- list()

  # Run every loader, keeping only the successful results
  for (name in names(loaders)) {
    tryCatch(
      datasets[[name]] <- loaders[[name]](),
      error = function(e) {
        cli::cli_alert_danger("Failed to load dataset: {name}. Error: {e$message}")
      }
    )
  }

  # List mode: hand back the individual data frames
  if (!combine) {
    cli::cli_alert_success("Returning datasets as a list")
    return(datasets)
  }

  # Default mode: stack everything, tagging rows with their dataset name
  cli::cli_h1("Combining datasets")
  combined_data <- dplyr::bind_rows(datasets, .id = "dataset")
  cli::cli_alert_success("Successfully combined datasets")
  combined_data
}
113 changes: 113 additions & 0 deletions R/load_biomq.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
#' Load and process the BIOMQ dataset
#'
#' This function loads and processes the "BIOMQ" dataset from a predefined external Excel file.
#' It validates the file and columns, applies transformations, and integrates species codes using a reference table.
#'
#' @return A processed `data.frame` with standardized columns and integrated species codes.
#' @examples
#' \dontrun{
#' processed_biomq <- load_biomq()
#' }
#' @export
load_biomq <- function() {

  # h2 so loader headings nest under load_all_datasets()'s h1,
  # consistent with the other load_* functions
  cli::cli_h2("BIOMQ")
  cli::cli_alert_info("Starting integration procedure on {external_files$biomq$path}")

  # Assert file exists
  if (!file.exists(external_files$biomq$path)) {
    cli::cli_abort("Could not find file: {external_files$biomq$path}")
  }

  # Load data from the second sheet
  biomq <- readxl::read_excel(path = external_files$biomq$path, sheet = 2) |> tibble::as_tibble()

  # Assert columns exist
  missing_cols <- setdiff(external_files$biomq$check_columns, names(biomq))
  if (length(missing_cols) > 0) {
    cli::cli_abort(c(
      "Missing required columns in dataset:",
      paste(missing_cols, collapse = ", ")
    ))
  }

  cli::cli_alert_info("Applying transformation on {nrow(biomq)} rows")

  # Select and rename columns, then derive date/source/colony metadata
  biomq <- biomq |>
    dplyr::select(
      NomCol, CentroideX, CentroideY, NomFR,
      nb_nicheur, methode, nomRef, AnneeDebut,
      MoisDebut, JourDebut
    ) |>
    dplyr::rename(
      locality = NomCol,
      longitude = CentroideX,
      latitude = CentroideY,
      abondance = nb_nicheur,
      inv_type = methode,
      obs = nomRef,
      year = AnneeDebut,
      month = MoisDebut,
      day = JourDebut
    ) |>
    dplyr::mutate(
      date = lubridate::make_date(year, month, day),
      source = "BIOMQ",
      nom_fr = tolower(NomFR),
      link = external_files$biomq$path,
      colony = TRUE
    )

  # Enforce sp.: strip accents and map plural group names to "<group> sp."
  biomq <- biomq |>
    dplyr::mutate(
      nom_fr = stringr::str_replace_all(
        stringi::stri_trans_general(nom_fr, "latin-ascii") |> tolower(),
        c(
          "goelands" = "goeland sp.",
          "sternes" = "sterne sp.",
          "cormorans" = "cormoran sp."
        ))
    )

  # Join TAXO - Match CODE_ID using Nom_FR, then patch unresolved codes
  # via the manual `equivalences` lookup (applied once; a second identical
  # mutate in an earlier revision was a redundant no-op and was removed)
  biomq <- biomq |>
    dplyr::left_join(
      dplyr::select(get_species_codes(), code_id, nom_fr) |>
        dplyr::mutate(nom_fr = tolower(nom_fr)) |>
        dplyr::distinct(),
      by = "nom_fr",
      na_matches = "never"
    ) |>
    dplyr::mutate(
      code_id = ifelse(
        nom_fr %in% names(equivalences),
        equivalences[nom_fr],
        code_id
      )
    )

  # Drop non-relevant coordinates (rough bounding box for eastern North America)
  biomq <- biomq |>
    dplyr::filter(
      longitude >= -100 & longitude <= -30,
      latitude >= 30 & latitude <= 70
    )

  # Re-order cols
  biomq <- dplyr::select(biomq, dplyr::all_of(final_cols))

  cli::cli_alert_success("Returning {nrow(biomq)} rows")
  return(biomq)
}
81 changes: 81 additions & 0 deletions R/load_canards.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@

#' Load and process the Canards de Mer dataset
#'
#' This function loads and processes the "Canards de Mer" dataset from a predefined external source.
#' It standardizes column names, performs transformations, and integrates species codes using a reference table.
#'
#' @return A processed `data.frame` with standardized columns and integrated species codes.
#' @examples
#' \dontrun{
#' processed_canards <- load_canards()
#' }
#' @export
load_canards <- function() {

  cli::cli_h2("Canards de mer")
  cli::cli_alert_info("Starting integration procedure on { external_files$canards_de_mer$path }")

  # Fail fast if the input file is missing
  if (!file.exists(external_files$canards_de_mer$path)) {
    cli::cli_abort("Could not find file { external_files$canards_de_mer$path }")
  }

  # Read file (read.csv2: ';' separator, ',' decimal mark)
  canards <- read.csv2(external_files$canards_de_mer$path) |> tibble::as_tibble()

  # assert columns exist
  missing_cols <- setdiff(external_files$canards_de_mer$check_columns, names(canards))
  if (length(missing_cols) > 0) {
    cli::cli_abort(c(
      "Missing required columns in dataset:",
      paste(missing_cols, collapse = ", ")
    ))
  }

  cli::cli_alert_info("Applying transformation on { nrow(canards) } rows")

  canards <- canards |>
    dplyr::mutate(date_obs = lubridate::make_date(Annee, Mois, Jour)) |>
    dplyr::select(
      latitude = "LATITUDE",
      longitude = "LONGITUDE",
      locality = "NomLieu",
      date = "date_obs",
      abondance = "NombreTotal",
      obs = "NomObservateur",
      inv_type = "Signification.1",
      nom_fr = "Nom_FR"
    ) |>
    dplyr::mutate(
      latitude = as.numeric(latitude),
      longitude = as.numeric(longitude),
      # Normalize a standalone "sp" / "sp." token to "sp.".
      # The word boundaries are required: the previous pattern "sp" rewrote
      # every occurrence of the substring (e.g. "spatule" -> "sp.atule",
      # "sp." -> "sp..")
      nom_fr = stringi::stri_trans_general(gsub("\\bsp\\b\\.?", "sp.", nom_fr, ignore.case = TRUE), "latin-ascii") |>
        tolower(),
      source = "Canards de mer",
      colony = FALSE,
      link = external_files$canards_de_mer$path
    )

  # Join TAXO - Match code_id using nom_fr, then patch unresolved codes
  # via the manual `equivalences` lookup
  canards <- canards |>
    dplyr::left_join(
      dplyr::select(get_species_codes(), code_id, nom_fr) |> dplyr::distinct(),
      by = "nom_fr",
      na_matches = "never"
    ) |>
    dplyr::mutate(
      code_id = ifelse(
        nom_fr %in% names(equivalences),
        equivalences[nom_fr],
        code_id
      )
    )

  # Re-order cols
  canards <- dplyr::select(canards, dplyr::all_of(final_cols))

  cli::cli_alert_success("Returning { nrow(canards) } rows")

  return(canards)
}
Loading

0 comments on commit 7b20688

Please sign in to comment.