From c5c568b18b14c51badd294ae418a11a824261c69 Mon Sep 17 00:00:00 2001 From: Karlo Guidoni Date: Sun, 11 Dec 2022 19:26:35 -0300 Subject: [PATCH] dataset names must be unique --- NEWS.md | 1 + R/bdc_standardize_datasets.R | 6 ++++- .../testthat/test-bdc_standardize_datasets.R | 25 +++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 0881f57b..95cc87ab 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,6 @@ # bdc 1.1.3 +- `bdc_standardize_datasets` now throws an error when dataset names defined in the metadata file are not unique. - Fix minor bug in `bdc_coordinates_country_inconsistent()` (see: 5c4e0aa). - `{countrycode}` and `{rangeBuilder}` dependencies were removed. Country names now are derived from [Stefan Gabos](https://github.com/stefangabos/world_countries/) repository diff --git a/R/bdc_standardize_datasets.R b/R/bdc_standardize_datasets.R index 6eb0ab90..f1f6c4f5 100644 --- a/R/bdc_standardize_datasets.R +++ b/R/bdc_standardize_datasets.R @@ -105,6 +105,10 @@ bdc_standardize_datasets <- metadata %>% dplyr::pull(fileName) + if (length(unique(metadata$datasetName)) != nrow(metadata)) { + stop("[ERROR]: Dataset names defined in the `datasetName` column must be unique.") + } + for (file_index in seq_along(input_file)) { input_filename <- metadata %>% @@ -258,7 +262,7 @@ bdc_standardize_datasets <- # here::here("data", "temp_datasets") %>% save_in_dir %>% fs::dir_ls(regexp = "*.qs") %>% - purrr::map_dfr(~ qs::qread(.x) %>% + purrr::map_dfr(~ qs::qread(.x) %>% dplyr::mutate(dplyr::across( .cols = dplyr::everything(), ~ as.character(.x) ))) diff --git a/tests/testthat/test-bdc_standardize_datasets.R b/tests/testthat/test-bdc_standardize_datasets.R index 969a69de..63fed5e9 100644 --- a/tests/testthat/test-bdc_standardize_datasets.R +++ b/tests/testthat/test-bdc_standardize_datasets.R @@ -81,6 +81,13 @@ wrong_metadata <- tibble::tribble( "datafake4", df4_path, NA, "nome_das_especies", "y", "x", NA, "notes" ) 
+metadata_repeated_datasetName <- tibble::tribble( + ~datasetName, ~fileName, ~occurrenceID, ~scientificName, ~decimalLatitude, ~decimalLongitude, + "datafake1", df1_path, "id", "species", "latitude", "longitude", + "datafake2", df2_path, "id_number", "spp", "lat", "lon", + "datafake1", df1_path, "id", "species", "latitude", "longitude" +) + bdc_standardize_datasets(metadata = metadata, overwrite = TRUE, format = "qs", save_database = FALSE) test_that("bdc_standardize_datasets can create qs files", { @@ -192,3 +199,21 @@ test_that("bdc_standardize_datasets can create 00_merged_datasets.qs", { unlink(here::here("Output"), recursive = TRUE) }) + +test_that("bdc_standardize_datasets throws an error when dataset names are not unique", { + + res <- + capture_error( + bdc_standardize_datasets( + metadata = metadata_repeated_datasetName, + overwrite = TRUE, + format = "csv", + save_database = FALSE + ) + ) + + expect_equal(res$message, "[ERROR]: Dataset names defined in the `datasetName` column must be unique.") + + unlink(here::here("Output"), recursive = TRUE) + +})