From 1c0e05ad86ca70dc1d1768303aeb77bf8860f638 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Mon, 23 Dec 2024 10:15:37 -0700 Subject: [PATCH 1/7] remove unused packages from Imports --- DESCRIPTION | 3 --- 1 file changed, 3 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index eeb2383..18a557d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -22,13 +22,11 @@ Remotes: nationalparkservice/DPchecker, nationalparkservice/QCkit Imports: - EML, dplyr, httr, XML, curl, tools, - rlang, readr, magrittr, crayon, @@ -37,7 +35,6 @@ Imports: QCkit (>= 0.1.4), here, jsonlite, - cli, purrr, tibble, lifecycle From a74e48ac3c5425627602ccdde15f8a68eafedb81 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Mon, 23 Dec 2024 10:15:55 -0700 Subject: [PATCH 2/7] add updates about removing old/dead/bad functions --- NEWS.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS.md b/NEWS.md index 4cc1136..c307506 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # NPSutils 0.3.3 (under development) +## 2024-12-19 + * remove `validate_data_package()` as this function was listed as "still under construction" and is mostly obsolete given other functions and functions in the DPchecker package. + * remove `load_domains()` as this function was not working properly and was conceived of before the data package specifications were properly set. ## 2024-12-19 * updated `load_pkg_metadata` to be simpler and essentially call `DPchecker::load_metadata` but with a preset default directory structure that works well with the default settings for `get_data_package`. * Add meta-analysis functions for finding and producing summary statistics multiple data packages including `get_ref_list`, `get_refs_info()`, and `summarize_packages`. From 83be8407768e90bbf0cde95f054bcd67acc9669a Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Mon, 23 Dec 2024 10:16:17 -0700 Subject: [PATCH 3/7] delete files with old/dead/bad and/or poorly conceived functions. 
--- R/load_domains.R | 122 ---------------------------------------- R/validateDataPackage.R | 24 -------- 2 files changed, 146 deletions(-) delete mode 100644 R/load_domains.R delete mode 100644 R/validateDataPackage.R diff --git a/R/load_domains.R b/R/load_domains.R deleted file mode 100644 index 0e28d82..0000000 --- a/R/load_domains.R +++ /dev/null @@ -1,122 +0,0 @@ -#' Read contents of data package file and construct a data frame based on the metadata file summarizing the enumerated domains for -#' categorical fields and their types/definitions. -#' -#' @description \code{load_domains} reads the data file from a package and loads a list of fields and their attributes into a data frame. -#' -#' @param holding_id is a 6-7 digit number corresponding to the holding ID of the data package zip file. -#' @param data_format is a character value indicating the format of the data set(s) within the data package. Currently -#' allowable options are: -#' * "csv" for comma separated value text files -#' * "gdb" for file geodatabases -#' @param metadata_format is a character value indicating the format of the metadata file within the data package. -#' Currently allowable options are: -#' * "eml" for eml-compliant xml metadata files -#' * "fgdc" for FGDC-compliant xml metadata files -#' -#' @return one data frame to the global environment. 
-#' -#' @export -#' -#' @examples -#' \dontrun{ -#' load_domains(2266200, data_format = "gdb", metadata_format = "fgdc") -#' } -load_domains <- function(holding_id, data_format, metadata_format) { - DataPackageDirectory <- paste("data/raw/", holding_id, sep = "") - DataPackageFilename <- paste(DataPackageDirectory, "/", holding_id, ".zip", sep = "") - - if (data_format == "csv" & metadata_format == "eml") { - fileList <- utils::unzip(DataPackageFilename, list = TRUE) - - csvfile <- subset(fileList, grepl(".csv", Name)) - emlfile <- subset(fileList, grepl(".xml", Name)) - csvFilename <- paste(DataPackageDirectory, "/", csvfile[1], sep = "") - emlFilename <- paste(DataPackageDirectory, "/", emlfile[1], sep = "") - - eml_file <- EML::read_eml(emlFilename, from = "xml") - attributeList <- EML::get_attributes(eml_file$dataset$dataTable$attributeList) - attributes <- attributeList$attributes - factors <- attributeList$factors - factors <- factors[, c(3, 1, 2)] - - return(factors) - } else if (data_format == "gdb" & metadata_format == "fgdc") { - # Working with the metadata file first... - - xmlfile <- list.files(path = DataPackageDirectory, pattern = ".xml") - xmlFilename <- paste0(DataPackageDirectory, "/", xmlfile) - workingXMLfile <- EML::read_eml(xmlFilename, from = "xml") - - # return the metadata to the workspace as a list. 
- assign(paste0(holding_id, "_fgdcMetadata"), workingXMLfile, envir = .GlobalEnv) - - # Build attributes table from the xml file - attributes <- data.frame( - id = numeric(), - attribute = character(), - attributeDefinition = character(), - attributeType = character(), - attributeFactors = numeric(), - stringsAsFactors = FALSE - ) - for (i in 1:length(workingXMLfile$ea$detailed$attr)) { - attributes <- rbind( - attributes, - cbind( - id = i, - attribute = workingXMLfile$ea$detailed$attr[[i]]$attrlabl, - attributeDefinition = workingXMLfile$ea$detailed$attr[[i]]$attrdef, - attributeType = workingXMLfile$ea$detailed$attr[[i]]$attrtype, - attributeFactors = length(workingXMLfile$ea$detailed$attr[[i]]$attrdomv) - ) - ) - } - - attributes$id <- as.integer(as.character(attributes$id)) - attributes$attribute <- as.character(attributes$attribute) - attributes$attributeDefinition <- as.character(attributes$attributeDefinition) - attributes$attributeType <- as.character(attributes$attributeType) - attributes$attributeFactors <- as.integer(as.character(attributes$attributeFactors)) - - attributes$columnclass <- "character" - attributes$columnclass <- ifelse(attributes$attributeType == "OID", - "integer", - attributes$columnclass - ) - attributes$columnclass <- ifelse(attributes$attributeType == "Date", - "Date", - attributes$columnclass - ) - attributes$columnclass <- ifelse(attributes$attributeType == "Double", - "numeric", - attributes$columnclass - ) - - # Get the factor definitions for class variables - attributeLevels <- data.frame( - attribute = character(), - factor = character(), - factorDefinition = character(), - stringsAsFactors = FALSE - ) - attributesWithFactors <- subset(attributes, attributeFactors > 1) - for (i in 1:nrow(attributesWithFactors)) { - for (j in 1:attributesWithFactors[i, 5]) { - attributeLevels <- rbind(attributeLevels, cbind( - attribute = attributesWithFactors[i, 2], - factor = workingXMLfile$ea$detailed$attr[[attributesWithFactors[i, 
1]]]$attrdomv[[j]]$edom$edomv, - factorDefinition = workingXMLfile$ea$detailed$attr[[attributesWithFactors[i, 1]]]$attrdomv[[j]]$edom$edomvd - )) - } - } - - attributeLevels$attribute <- as.character(attributeLevels$attribute) - attributeLevels$factor <- as.character(attributeLevels$factor) - attributeLevels$factorDefinition <- as.character(attributeLevels$factorDefinition) - - # return the enumerated domain table to the workspace as a list and data frame respectively - return(attributeLevels) - } else { - print("data/metadata format combination not supported") - } -} diff --git a/R/validateDataPackage.R b/R/validateDataPackage.R deleted file mode 100644 index ae0fac3..0000000 --- a/R/validateDataPackage.R +++ /dev/null @@ -1,24 +0,0 @@ -#' Validate a Tabular Data Package -#' -#' @description \code{validate_data_package} checks a zipped data package for required files and structure. This function is still under construction. -#' -#' @param file is a path to a zipped data package -#' -#' @export -#' -#' @examples -#' \dontrun{ -#' validate_data_package(File = "C:/test/test_package.zip") -#' } -validate_data_package <- function(file) { - #### It seems like the DPchecker package now performs this function? - #### Is this function really necessary anymore? - - # look for a manifest - # compare manifest to zip contents - # look for metadata file - only one - # look for data files - provide a count - # look for a readme - optional - # look for additional files - rlang::inform("This function is still under construction") -} From ceb1f931b29df391d23a00b6f170f7895bde1760 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Mon, 23 Dec 2024 10:16:31 -0700 Subject: [PATCH 4/7] add importFrom stats na.omit --- R/map_wkt.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/map_wkt.R b/R/map_wkt.R index 36e4315..a00d670 100644 --- a/R/map_wkt.R +++ b/R/map_wkt.R @@ -11,6 +11,7 @@ #' #' @return The function returns a dynamic, zoomable leaflet map with the specific geometry plotted. 
#' @importFrom magrittr %>% +#' @importFrom stats na.omit #' @export #' #' @examples From 5e287b0a40b38349834a5a74509a78d2a0ac812a Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Mon, 23 Dec 2024 10:16:59 -0700 Subject: [PATCH 5/7] update documentation to be more specific about what happens if you try to download restricted references without proper authentication/permissions. --- R/meta_analyses.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/meta_analyses.R b/R/meta_analyses.R index 2cf901e..c233e35 100644 --- a/R/meta_analyses.R +++ b/R/meta_analyses.R @@ -138,7 +138,7 @@ get_refs_info <- function (reference_type = "dataPackage", #' #' If a data package fails to download (or load) into R, the function will return NAs instead of summary data about the data package as well as a message about the package status ("Loads", "Error") in the dataframe that the function returns. The function will ignore files that fall outside the data package specifications (one or more .csv files and a single .xml file ending in *_metadata.xml). #' -#' When `check_metadata` is set to the default `FALSE`, the function will attempt to and load any .csv, regardless of the contents. Data packages with restricted access can produce false positives if you do not have the appropriate permissions to download the data as the function will still download the files, but they will be populated with unhelpful hmtl rather than the intended data. Functions that fail to load into R likely violate the data package specifications in some fundamental way (e.g. .CSV file instead of .csv or no .csv files at all). +#' When `check_metadata` is set to the default `FALSE`, the function will attempt to and load any .csv, regardless of the contents. 
Data packages with restricted access can produce false positives if you do not have the appropriate permissions to download the data as the function will still download the files, but they will be populated with unhelpful html rather than the intended data. In this case, each .csv will be listed as having 5 columns and one row of data. Functions that completely fail to load into R likely violate the data package specifications in some fundamental way (e.g. .CSV file instead of .csv or no .csv files at all). #' #' When `check_metadata` is set to `TRUE`, additional checks and tests are run on the data package and load errors may occur for all of the above reasons and also if there are multiple .xml files, if the metadata file name does not end in "*_metadata.xml", if there is no metadata file, or if the metadata file is EML schema-invalid. #' From 7885268e4ec705c2b456ca4b5cd3d31c2aa2d3b9 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Mon, 23 Dec 2024 10:17:21 -0700 Subject: [PATCH 6/7] add attributeName to global variables. 
--- R/utils.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index 3e88b00..9db62d9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -63,5 +63,6 @@ globalVariables(c("capture.output", "metaformat", "data_format", "metadata_format", - "fileList" + "fileList", + "attributeName" )) \ No newline at end of file From 2b63bd7ab2999c527967ab6c6bd16025321aab29 Mon Sep 17 00:00:00 2001 From: Rob Baker Date: Mon, 23 Dec 2024 10:17:55 -0700 Subject: [PATCH 7/7] updated via devtools::document and pkgdown::build_site_github_pages --- NAMESPACE | 3 +- docs/news/index.html | 15 +-- docs/pkgdown.yml | 2 +- docs/reference/index.html | 8 -- docs/reference/load_domains.html | 119 ---------------------- docs/reference/validate_data_package.html | 101 ------------------ docs/sitemap.xml | 2 - man/load_domains.Rd | 33 ------ man/summarize_packages.Rd | 2 +- man/validate_data_package.Rd | 19 ---- 10 files changed, 4 insertions(+), 300 deletions(-) delete mode 100644 docs/reference/load_domains.html delete mode 100644 docs/reference/validate_data_package.html delete mode 100644 man/load_domains.Rd delete mode 100644 man/validate_data_package.Rd diff --git a/NAMESPACE b/NAMESPACE index 1ab94b9..70a8c55 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -20,11 +20,10 @@ export(load_core_metadata) export(load_data_package) export(load_data_package_deprecated) export(load_data_packages) -export(load_domains) export(load_pkg_metadata) export(map_wkt) export(rm_local_packages) export(summarize_packages) -export(validate_data_package) importFrom(lifecycle,deprecated) importFrom(magrittr,"%>%") +importFrom(stats,na.omit) diff --git a/docs/news/index.html b/docs/news/index.html index fcef3ea..9f53ec9 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -52,20 +52,7 @@

Changelog

-
-

2024-12-19

-
  • updated load_pkg_metadata to be simpler and essentially call DPchecker::load_metadata but with a preset default directory structure that works well with the default settings for get_data_package.
  • -
  • Add meta-analysis functions for finding and producing summary statistics multiple data packages including get_ref_list, get_refs_info(), and summarize_packages. ## 2024-10-24
  • -
  • fix how get_data_package aliases get_data_packages, specifically now allows users to adjust parameters to non-default settings. ## 2024-10-21
  • -
  • Bug fixes to load_data_package() -
  • -
  • Bug fixes to .get_authors() -
  • -
  • -get_authors now adds a period (.) after given names with a single character and can handle an unlimited number of given names.
  • -
  • Moved sf, leaflet, and stringr to from imports to suggests.
  • -
  • Enable .get_contacts() to handle cases when there is only one contact.
  • -
+

##2024-12-19 * remove validate_data_package() as this function was listed as “still under construction” is mostly obsolete given other functions and functions in the DPchecker package. * remove load_domains() as this function was not working properly and was conceived of before the data package specifications were properly set. ## 2024-12-19 * updated load_pkg_metadata to be simpler and essentially call DPchecker::load_metadata but with a preset default directory structure that works well with the default settings for get_data_package. * Add meta-analysis functions for finding and producing summary statistics multiple data packages including get_ref_list, get_refs_info(), and summarize_packages. ## 2024-10-24 * fix how get_data_package aliases get_data_packages, specifically now allows users to adjust parameters to non-default settings. ## 2024-10-21 * Bug fixes to load_data_package() * Bug fixes to .get_authors() * get_authors now adds a period (.) after given names with a single character and can handle an unlimited number of given names. * Moved sf, leaflet, and stringr to from imports to suggests. * Enable .get_contacts() to handle cases when there is only one contact.

diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index 2d80c3d..2024c90 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -3,4 +3,4 @@ pkgdown: 2.1.0 pkgdown_sha: ~ articles: NPSutils: NPSutils.html -last_built: 2024-12-20T18:46Z +last_built: 2024-12-23T17:12Z diff --git a/docs/reference/index.html b/docs/reference/index.html index 625e151..7568ba5 100644 --- a/docs/reference/index.html +++ b/docs/reference/index.html @@ -125,10 +125,6 @@

All functions load_data_package_deprecated()

Read contents of data package and constructs a list of tibbles based on the data file(s)

- -

load_domains()

- -

Read contents of data package file and construct a data frame based on the metadata file summarizing the enumerated domains for categorical fields and their types/definitions.

load_pkg_metadata()

@@ -145,10 +141,6 @@

All functions summarize_packages()

Collect summary statistics on data packages

- -

validate_data_package()

- -

Validate a Tabular Data Package