Import TRY citations, and some useful DB functions #1848
Changes from all commits
@@ -1,2 +1,3 @@
^.*\.Rproj$
^\.Rproj\.user$
try\.sqlite
@@ -0,0 +1,2 @@
try.sqlite
inst/import-try/data-proc
@@ -18,14 +18,23 @@ Imports:
    PEcAn.utils,
    dbplyr (>= 1.2.0),
    dplyr,
    tibble,
    purrr,
    tidyr,
    glue,
    lubridate,
    magrittr,
    ncdf4,
    plyr (>= 1.8.4),
    udunits2
Suggests:
    RPostgreSQL,
    testthat (>= 1.0.2)
    RSQLite,
    testthat (>= 1.0.2),
    tidyverse,
    data.table,
    rcrossref,
    here
License: FreeBSD + file LICENSE
Copyright: Authors
LazyLoad: yes
@@ -0,0 +1,29 @@
#' Merge local data frame into SQL table
#'
#' @inheritParams insert_table
#' @inheritDotParams insert_table
#' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values`.
#' @return Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table)
#' @export
#' @examples
#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
#' dplyr::copy_to(irisdb, iris[1:10, ], name = "iris", overwrite = TRUE)
#' db_merge_into(iris[1:12, ], "iris", irisdb)
#' dplyr::tbl(irisdb, "iris") %>% dplyr::count()
db_merge_into <- function(values, table, con, by = NULL, drop = FALSE, ...) {
  values_fixed <- match_dbcols(values, table, con, drop = FALSE)
  if (is.null(by)) {
    by <- match_colnames(values, table, con)
  }
  sql_tbl <- dplyr::tbl(con, table)
  values_merge <- dplyr::anti_join(values_fixed, sql_tbl, by = by, copy = TRUE)
  if (nrow(values_merge) < 1 || ncol(values_merge) < 1) {
    PEcAn.logger::logger.warn(
      "Input table for merge is empty."
    )
  } else {
    insert <- insert_table(values_merge, table, con, ...)
  }
  dplyr::tbl(con, table) %>%
    dplyr::inner_join(values_fixed, copy = TRUE)
}
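A slightly longer sketch of the merge semantics, assuming (as in the roxygen example above) that an RSQLite connection works with the PEcAn.DB query functions: rows already present in the SQL table are filtered out by the anti-join, so repeating a merge does not create duplicates.

irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
dplyr::copy_to(irisdb, iris[1:10, ], name = "iris", overwrite = TRUE)

# First merge: rows 1-10 already exist, so only rows 11-12 are inserted.
db_merge_into(iris[1:12, ], "iris", irisdb)
dplyr::count(dplyr::tbl(irisdb, "iris"))   # expect n = 12

# Second merge with the same rows: the anti-join finds nothing new,
# so a warning is logged and no rows are inserted.
db_merge_into(iris[1:12, ], "iris", irisdb)
dplyr::count(dplyr::tbl(irisdb, "iris"))   # still n = 12

DBI::dbDisconnect(irisdb)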
@@ -0,0 +1,112 @@
#' Insert R data frame into SQL database
#'
#' First, subset to matching columns. Then, make sure the local and SQL column
#' classes match, coercing local to SQL as necessary (or throwing an error).
#' Then, build an SQL string for the insert statement. Finally, insert into the
#' database.
#'
#' @param values `data.frame` of values to write to SQL database
#' @param table Name of target SQL table, as character
#' @param coerce_col_class logical, whether or not to coerce local data columns
#'   to SQL classes. Default = `TRUE`.
#' @param drop logical. If `TRUE` (default), drop columns not found in SQL table.
#' @inheritParams db.query
#' @inherit db.query return
#' @export
#' @examples
#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
#' dplyr::copy_to(irisdb, iris[1, ], name = "iris", overwrite = TRUE)
#' insert_table(iris[-1, ], "iris", irisdb)
#' dplyr::tbl(irisdb, "iris")
insert_table <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) {
  values_fixed <- match_dbcols(values, table, con, coerce_col_class, drop = TRUE)
  insert_query <- build_insert_query(values_fixed, table, .con = con)
  db.query(insert_query, con)
}
#' Match column names and classes between local and SQL table
#'
#' @inheritParams insert_table
#' @return `values` `data.frame` with column names and classes matched to SQL
#' @export
match_dbcols <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) {
  use_cols <- match_colnames(values, table, con)
  if (length(use_cols) < 1) {
    PEcAn.logger::logger.severe(
      "No columns match between input and target table."
    )
  }
  PEcAn.logger::logger.debug(
    "Matched the following cols: ",
    paste(use_cols, collapse = ", ")
  )
  values_sub <- values[, use_cols]
  # Load one row to get column types
  sql_row <- dplyr::tbl(con, table) %>% head(1) %>% dplyr::collect()
  sql_types <- purrr::map(sql_row, class) %>%
    purrr::map_chr(1) %>%
    .[use_cols]
  values_types <- purrr::map(values_sub, class) %>% purrr::map_chr(1)
  type_mismatch <- sql_types != values_types
  if (sum(type_mismatch) > 0) {
    mismatch_string <- sprintf(
      "%s: local is %s, SQL is %s",
      names(values_types),
      values_types,
      sql_types
    )[type_mismatch]
    PEcAn.logger::logger.info(
      "Found type mismatches in the following columns: ",
      paste0(mismatch_string, collapse = "; ")
    )
    if (!coerce_col_class) {
      PEcAn.logger::logger.severe(
        "Type mismatch detected, and `coerce_col_class` is `FALSE`. ",
        "Fix column class mismatches manually."
      )
    } else {
      PEcAn.logger::logger.info(
        "Coercing local column types to match SQL."
      )
      # Coerce values data frame to these types
      values_fixed <- purrr::map2_dfc(values_sub, sql_types, as)
    }
  } else {
    values_fixed <- values_sub
  }
  if (drop) {
    values_fixed
  } else {
    drop_cols <- colnames(values)[!colnames(values) %in% use_cols]
    dplyr::bind_cols(values_fixed, values[, drop_cols])
  }
}
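The coercion step above leans on `methods::as()`; a minimal illustration with made-up columns (`local_df` and `sql_types` are hypothetical):

# map2_dfc() walks the local columns and the corresponding SQL classes in
# parallel, coercing each column with as(column, class).
local_df  <- data.frame(id = "42", count = 7, stringsAsFactors = FALSE)
sql_types <- c(id = "integer", count = "character")
purrr::map2_dfc(local_df, sql_types, as)
# Roughly: a two-column tibble with id = 42L and count = "7"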
#' Match names of local data frame to SQL table
#'
#' @inheritParams insert_table
match_colnames <- function(values, table, con) {
  tbl_db <- dplyr::tbl(con, table)
  table_cols <- dplyr::tbl_vars(tbl_db)
  values_cols <- colnames(values)
  intersect(values_cols, table_cols)
}
#' Build query to insert R data frame into SQL table
#'
#' @inheritParams insert_table
#' @inheritParams glue::glue_sql
build_insert_query <- function(values, table, .con) {
  value_list <- purrr::map(seq_len(nrow(values)), ~as.list(values[.x, ]))

  insert_list <- value_list %>%
    purrr::map(unname) %>%
    purrr::map(dbplyr::escape) %>%
    purrr::map(dbplyr::sql_vector)

  glue::glue_sql(
    "INSERT INTO {`table`} ({`colnames(values)`*}) ",
    "VALUES {insert_list*}",
    .con = .con
  )
}
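For a sense of what the generated statement looks like, a rough sketch with a hypothetical two-column table (`build_insert_query` is internal, and the exact quoting and value formatting depend on the connection):

con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
values <- data.frame(genus = "Acer", count = 3, stringsAsFactors = FALSE)
build_insert_query(values, "species", .con = con)
# Roughly: INSERT INTO `species` (`genus`, `count`) VALUES ('Acer', 3.0)
DBI::dbDisconnect(con)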
@@ -0,0 +1,67 @@
#' Perform crossref search for a list of references
#'
#' @param queries Character vector of queries
#' @inheritDotParams search_reference_single
#' @inherit search_reference_single description return
#' @export
search_references <- function(queries, ...) {
  search_fun <- search_reference_single %>%
    purrr::partial(...) %>%
    purrr::possibly(otherwise = data.frame(title = "Not found"))
  encodeString(queries) %>%
    purrr::map_dfr(search_fun)
}
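The `purrr::possibly()` wrapper is what keeps a single failed lookup from aborting the whole batch; a minimal sketch of that behaviour (`failing_lookup` is hypothetical):

failing_lookup <- function(x) stop("crossref lookup failed")
safe_lookup <- purrr::possibly(failing_lookup, otherwise = data.frame(title = "Not found"))
safe_lookup("some citation")
# Returns data.frame(title = "Not found") instead of throwing an error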
#' Perform crossref search for a single reference
#'
#' Requires the `rcrossref` package.
#'
#' @param query Citation string (length 1) to search for DOI
#' @param min_score Minimum match score. Default (85) is fairly strict.
#' @param limit Number of results to return
#' @return `data.frame` containing crossref information converted to match bety citations table.
search_reference_single <- function(query, limit = 1, min_score = 85) {
  stopifnot(length(query) == 1)
  PEcAn.logger::logger.debug("Processing query:\n", query)
  crsearch <- rcrossref::cr_works(query = query, limit = limit)
  if (is.null(crsearch[["data"]])) {
    PEcAn.logger::logger.warn(
      "Error in crossref query. ",
      "Setting title to search string and leaving other fields blank."
    )
    return(tibble::tibble(query = query))
  }
  crdata <- crsearch[["data"]] %>%
    dplyr::mutate(score = as.numeric(score)) %>%
    dplyr::filter(score > min_score)
  if (nrow(crdata) < 1) {
    PEcAn.logger::logger.info(
      "No matches found. ",
      "Setting title to search string and leaving other fields blank.")
    return(tibble::tibble(query = query))
  }
  keep_cols <- c(
    "author",
    "year",
    "title",
    journal = "container.title",
    vol = "volume",
    pg = "page",
    doi = "DOI",
    "score",
    "query"
  )
  proc_search <- crdata %>%
    dplyr::mutate(
      # Get the first author only -- this is the BETY format
      author_family = purrr::map(author, list("family", 1)),
      author_given = purrr::map(author, list("given", 1)),
      author = paste(author_family, author_given, sep = ", "),
      year = gsub("([[:digit:]]{4}).*", "\\1", issued) %>% as.numeric(),
      query = query,
      score = as.numeric(score)
    )
  use_cols <- keep_cols[keep_cols %in% colnames(proc_search)]
  dplyr::select(proc_search, !!!use_cols)
}
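A hedged usage sketch (requires the `rcrossref` package and network access; the citation strings are only illustrative):

refs <- c(
  "Wright IJ et al. 2004. The worldwide leaf economics spectrum. Nature.",
  "Reich PB, Walters MB, Ellsworth DS. 1997. From tropics to tundra: Global convergence in plant functioning."
)
cite_df <- search_references(refs, limit = 1, min_score = 85)
# Expect one row per query; confident matches carry author, year, title,
# journal, vol, pg, doi, score, and query, while unmatched or failed queries
# come back mostly empty.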
@@ -0,0 +1,91 @@
#' Convert TRY text file to SQLite database
#'
#' The TRY file is huge and unnecessarily long, which makes it difficult to
#' work with. The resulting SQLite database is much smaller on disk, and can be
#' read much faster thanks to lazy evaluation.
#'
#' The resulting TRY SQLite database contains the following tables:
#' - `values` -- The actual TRY data. Links to all other tables through ID columns.
#' - `traits` -- Description of trait and data names. Links to `values` through `DataID`. Similar to BETY `variables` table.
#' - `datasets` -- Description of datasets and references/citations. Links to `values` through `DatasetID` and `ReferenceID`.
#' - `species` -- Species. Links to `values` through `AccSpeciesID`.
#'
#' @param try_files Character vector of file names containing TRY data.
#'   Multiple files are combined with `data.table::rbindlist`.
#' @param sqlite_file Target SQLite database file name, as character.
#' @export
try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
  # Read files
  PEcAn.logger::logger.info("Reading in TRY data...")
  raw_data <- Map(data.table::fread, try_files) %>%
    data.table::rbindlist()

  # Create integer reference ID for compact storage
  PEcAn.logger::logger.info("Adding ReferenceID column")
  raw_data[["ReferenceID"]] <- as.integer(factor(raw_data[["Reference"]]))

  # Create tables
  PEcAn.logger::logger.info("Extracting data values table.")
  data_cols <- c(
    "ObsDataID",             # TRY row ID -- unique to each observation of a given trait
    "ObservationID",         # TRY "entity" ID -- identifies a set of trait measurements (e.g. leaf)
    "DataID",                # Links to data ID
    "StdValue",              # Standardized, QA-QC'ed value
    "UnitName",              # Standardized unit
    "AccSpeciesID",          # Link to 'species' table
    "DatasetID",             # Link to 'datasets' table.
    "ReferenceID",           # Link to 'try_references' table.
    "ValueKindName",         # Type of value, e.g. mean, min, max, etc.
    "UncertaintyName",       # Kind of uncertainty
    "Replicates",            # Number of replicates
    "RelUncertaintyPercent",
    "OrigValueStr",          # Original data, as character string (before QA/QC)
    "OrigUnitStr",           # Original unit, as character string (before QA/QC)
    "OrigUncertaintyStr"     # Original uncertainty, as character string (before QA/QC)
  )
  data_values <- unique(raw_data[, data_cols, with = FALSE])

  PEcAn.logger::logger.info("Extracting datasets table...")
  datasets_cols <- c(
    "DatasetID",
    "Dataset",
    "LastName",
    "FirstName",
    "Reference",
    "ReferenceID"
  )
  datasets_values <- unique(raw_data[, datasets_cols, with = FALSE])

  PEcAn.logger::logger.info("Extracting traits table...")
  traits_cols <- c(
    "DataID",
    "DataName",
    "TraitID",
    "TraitName"
  )
  traits_values <- unique(raw_data[, traits_cols, with = FALSE])

  PEcAn.logger::logger.info("Extracting species table...")
  species_cols <- c(
    "AccSpeciesID",
    "AccSpeciesName",
    "SpeciesName"
  )
  species_values <- unique(raw_data[, species_cols, with = FALSE])

  PEcAn.logger::logger.info("Writing tables to SQLite database...")
  con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file)
  on.exit(DBI::dbDisconnect(con))
  PEcAn.logger::logger.info("Writing values table...")
  DBI::dbWriteTable(con, "values", data_values)
  PEcAn.logger::logger.info("Writing traits table...")
  DBI::dbWriteTable(con, "traits", traits_values)
  PEcAn.logger::logger.info("Writing datasets table...")
  DBI::dbWriteTable(con, "datasets", datasets_values)
  PEcAn.logger::logger.info("Writing species table...")
  DBI::dbWriteTable(con, "species", species_values)

  PEcAn.logger::logger.info("Done creating TRY SQLite database!")

  NULL
}
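Once the database is written, it can be queried lazily with dplyr/dbplyr, e.g. (a sketch assuming the tables and ID columns documented above; the trait name is hypothetical):

library(dplyr)

con <- DBI::dbConnect(RSQLite::SQLite(), "try.sqlite")

# Lazy tables: nothing is pulled into memory until collect()
try_values  <- tbl(con, "values")
try_traits  <- tbl(con, "traits")
try_species <- tbl(con, "species")

sla <- try_values %>%
  inner_join(try_traits, by = "DataID") %>%
  inner_join(try_species, by = "AccSpeciesID") %>%
  filter(TraitName == "Specific leaf area") %>%   # hypothetical trait name
  select(AccSpeciesName, StdValue, UnitName) %>%
  collect()

DBI::dbDisconnect(con)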
@@ -1,3 +1,9 @@
##' Imports from other packages
##'
##' @importFrom magrittr `%>%`
#' Imports from other packages
#'
#' @name otherimports
#' @importFrom magrittr %>%
#' @export
magrittr::`%>%`

#' @rdname otherimports
#' @importFrom rlang !! !!!
I'm only 60% sure on this, but my impression is that the tidyverse package is analogous to PEcAn.all -- i.e. it exists for load-time convenience and for use in packages it's better to list the specific packages used -- Looks like tibble, tidyr, purrr?
Point taken, but it is only a `Suggests` rather than an `Imports`/`Depends`, and this way, it matches with what I'm doing in the scripts. Though, grudgingly, I'll admit it is good coding practice to be as explicit as possible about where things are coming from.
In general, my approach is to stick packages required for scripts but not internal functions in `Suggests`, since I think those aren't installed unless you explicitly ask for them in `devtools::install` (right?).
Also, as an aside, get a load of the `import` package, which brings pythonic `from Z import X as Y` syntax to R. I might switch to using this in a bunch of my interactive scripts.
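For reference, the `import` syntax mentioned above looks roughly like this (a sketch; see the package documentation for details):

# Approximately "from dplyr import select as pick, filter" in Python terms
import::from(dplyr, pick = select, filter)
import::from(magrittr, "%>%")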
Fair point on suggests vs imports, but I see calls to all three of tibble, purrr, tidyr in `R/`, so those should probably be in imports anyway 😉