Skip to content
This repository has been archived by the owner on Sep 9, 2022. It is now read-only.

Commit

Permalink
#19 started work for colletor names helpers - not done yet
Browse files Browse the repository at this point in the history
using tibble::as_data_frame for dframe now #21 still need to problably just use by deaflut everywhere
  • Loading branch information
sckott committed Jan 7, 2017
1 parent 795c605 commit af72c6d
Show file tree
Hide file tree
Showing 10 changed files with 227 additions and 42 deletions.
7 changes: 5 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ Description: Clean biological occurrence records. Includes functionality
for cleaning based on various aspects of spatial coordinates,
unlikely values due to political 'centroids', coordinates based on
where collections of specimens are held, and more.
Version: 0.1.1.9400
Version: 0.1.2.9115
Authors@R: c(person("Scott", "Chamberlain", role = c("aut", "cre"),
email = "[email protected]"))
License: MIT + file LICENSE
Expand All @@ -20,7 +20,10 @@ Imports:
Matrix,
magrittr,
qlcMatrix,
lazyeval
lazyeval,
crul,
jsonlite,
tibble
Suggests:
testthat,
knitr,
Expand Down
43 changes: 4 additions & 39 deletions R/clean_df.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,45 +16,10 @@ dframe.default <- function(x) {
}

#' @export
dframe.data.frame <- function(x) as_data_frame(x)
dframe.data.frame <- function(x) {
tibble::as_data_frame(x)
# as_data_frame(x)
}

#' @export
dframe.dframe <- function(x) x

as_data_frame <- function(x) {
stopifnot(is.list(x))
if (length(x) == 0) {
x <- list()
class(x) <- c("tbl_df", "tbl", "data.frame")
attr(x, "row.names") <- .set_row_names(0)
return(x)
}
names_x <- names2(x)
if (any(is.na(names_x) | names_x == "")) {
stop("All columns must be named", call. = FALSE)
}
ok <- vapply(x, is_1d, logical(1))
if (any(!ok)) {
stop("data_frames can only contain 1d atomic vectors and lists",
call. = FALSE)
}
n <- unique(vapply(x, NROW, integer(1)))
if (length(n) != 1) {
stop("Columns are not all same length", call. = FALSE)
}
class(x) <- c("dframe", "tbl_df", "tbl", "data.frame")
attr(x, "row.names") <- .set_row_names(n)
return(x)
}

names2 <- function(x) {
names(x) %||% rep.int("", length(x))
}

is_1d <- function(x) {
((is.atomic(x) && !is.null(x)) || is.list(x)) && length(dim(x)) <= 1
}

`%||%` <- function(x, y) {
if (is.null(x)) y else x
}
41 changes: 41 additions & 0 deletions R/collector-funs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#' Collector based cleaning
#'
#' @name collectors
#' @keywords internal
#' @param x (data.frame) A data.frame
#' @param collector (character) Collector field to use. See Details.
#' @param drop (logical) Drop bad data points or not. Either way, we parse
#' out bade data points as an attribute you can access. Default: \code{TRUE}
#'
#' @return Returns a data.frame, with attributes
#'
#' @details
#' Explanation of the functions:
#'
#' \itemize{
#' \item coll_clean - Standardize collector names
#' }
#'
#' @examples
#' df <- data.frame(
#' coll = c('K.F.P. Martius', 'C. F. P. Martius', 'C. F. P. von Martius'),
#' species = 'Poa annua',
#' lat = 1:3,
#' lon = 4:6,
#' stringsAsFactors = FALSE
#' )
#'
#' # Standardize names
#' NROW(df)
#' df <- dframe(df) %>% coll_clean()
#' NROW(df)
#' attr(df, "coll_clean")

#' @rdname collectors
coll_clean <- function(x, collector = NULL) {
x <- do_collectors(x, collector)
x <- stand_collectors(x)
if (NROW(x) == 0) x <- NA
row.names(x) <- NULL
structure(reassign(x), coll_clean = incomp)
}
8 changes: 8 additions & 0 deletions R/do_collectors.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# assign consitent collector variable to be able to act on data inputs
# more easily - and save original column names to rename data on return

do_collectors <- function(x, collector) {
x <- guess_collector(x, collector)
if (is.null(attr(x, "coll_var_orig"))) attr(x, "coll_var_orig") <- collector
return(x)
}
29 changes: 29 additions & 0 deletions R/guess_collector.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
guess_collector <- function(x, collector = NULL) {
nms <- names(x)

if (!is.null(attr(x, "coll_var_orig"))) collector <- attr(x, "coll_var_orig")

if (is.null(collector)) {
colls <- nms[grep(sprintf("^(%s)$", paste0(coll_options, collapse = "|")),
nms, ignore.case = TRUE)]

if (length(colls) == 1) {
if (length(nms) > 2) {
message("Assuming '", colls, "' is collector")
}
names(x)[names(x) %in% colls] <- coll_var <- "collector"
} else {
stop("Couldn't infer collector column, please specify w/ 'collector' parameter",
call. = FALSE)
}
} else {
if (!any(names(x) %in% collector)) {
stop("'", collector, "' not found in your data", call. = FALSE)
}
names(x)[names(x) %in% collector] <- coll_var <- "collector"
}

structure(x, coll_var = coll_var)
}

coll_options <- c("recordedBy", "collector", "coll")
58 changes: 58 additions & 0 deletions R/harvard-botanists.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#' Harvard botanist index functions
#'
#' @keywords internal
#' @examples \dontrun{
#' bot_search(name = "Asa Gray")
#' bot_search(name = "A. Gray")
#' bot_search(remarks = "harvard")
#' bot_search(name = "Gray", fuzzy = TRUE)
#' bot_search()
#' bot_search()
#'
#' ## FIXME - this leads to a JSON parsing error because they give
#' ## bad JSON in some results, including this example
#' # bot_search(country = "China")
#' }
bot_search <- function(name = NULL, individual = FALSE, start = NULL,
fuzzy = FALSE, remarks = NULL, speciality = NULL, country = NULL,
is_collector = FALSE, is_author = FALSE, team = FALSE,
error = stop, ...) {

cli <- crul::HttpClient$new(url = hbi_base())
args <- ct(list(
name = name, json = "y", individual = logt(individual), start = start,
soundslike = if (fuzzy) "true" else NULL, remarks = remarks,
speciality = speciality, country = country, is_collector = logt(is_collector),
is_author = logt(is_author), team = logt(team)
))
res <- cli$get(query = args, ...)
res$raise_for_status()
if ((err <- grepl("no matching result", res$parse("UTF-8"), ignore.case = TRUE))) {
error("(404) no matching results found", call. = FALSE)
}
if (err && as.character(substitute(error)) != "stop") return(NULL)
tibble::as_data_frame(
jsonlite::fromJSON(res$parse("UTF-8"))$botanists
)
}

hbi_base <- function() 'http://kiki.huh.harvard.edu/databases/botanist_search.php'

logt <- function(x) if (x) "on" else NULL

clean_dirty_json <- function(x) {
tmp <- gregexpr("\"\"[A-Za-z0-9]+\"\"", x)[[1]]
if (tmp == -1) {
x
} else {
substring(x, tmp, (tmp + attr(tmp, "match.length")) - 1)
}
}

# http://kiki.huh.harvard.edu/databases/botanist_search.php?name=Asa+Gray&individual=on&json=y
# http://kiki.huh.harvard.edu/databases/botanist_search.php?start=1&name=Gray&id=&soundslike=true&remarks=&specialty=&country=&is_collector=on

stand_collectors <- function(x) {
nms <- unique(x$collector)
res <- ct(lapply(nms, bot_search))
}
4 changes: 4 additions & 0 deletions R/zzz.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,7 @@ check4pkg <- function(x) {
}

ct <- function(l) Filter(Negate(is.null), l)

`%||%` <- function(x, y) {
if (is.null(x)) y else x
}
2 changes: 1 addition & 1 deletion inst/extdata/herbaria.csv
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ latitude,longitude,name
48.205212,16.359399,Naturhistorisches Museum Wien
59.368999,18.053043,Swedish Museum of Natural History (Naturhistoriska riksmuseet)
52.165119,4.475357,National Herbarium of the Netherlands (Nationaal Herbarium Nederland)
43.631944,3.863889,Universite Montpellier
43.631944,3.863889,Universite Montp
45.780837,4.867900,Universite Claude Bernard
47.358727,8.559751,Joint Herbarium of the University of Zurich and the ETH Zurich
50.928664,4.326260,National Botanic Garden of Belgium
Expand Down
30 changes: 30 additions & 0 deletions man/bot_search.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

47 changes: 47 additions & 0 deletions man/collectors.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit af72c6d

Please sign in to comment.