Import TRY citations, and some useful DB functions #1848

Merged: 21 commits, Feb 9, 2018
1 change: 1 addition & 0 deletions base/db/.Rbuildignore
@@ -1,2 +1,3 @@
^.*\.Rproj$
^\.Rproj\.user$
try\.sqlite
2 changes: 2 additions & 0 deletions base/db/.gitignore
@@ -0,0 +1,2 @@
try.sqlite
inst/import-try/data-proc
11 changes: 10 additions & 1 deletion base/db/DESCRIPTION
@@ -18,14 +18,23 @@ Imports:
PEcAn.utils,
dbplyr (>= 1.2.0),
dplyr,
tibble,
purrr,
tidyr,
glue,
lubridate,
magrittr,
ncdf4,
plyr (>= 1.8.4),
udunits2
Suggests:
RPostgreSQL,
testthat (>= 1.0.2)
RSQLite,
testthat (>= 1.0.2),
tidyverse,
Member:

I'm only 60% sure on this, but my impression is that the tidyverse package is analogous to PEcAn.all -- i.e. it exists for load-time convenience, and for use in packages it's better to list the specific packages used. Looks like tibble, tidyr, and purrr?

Member Author:

Point taken, but it is only in Suggests rather than Imports/Depends, and this way it matches what I'm doing in the scripts. Though, grudgingly, I'll admit it is good coding practice to be as explicit as possible about where things come from.

In general, my approach is to put packages that are required by scripts, but not by internal functions, in Suggests, since I think those aren't installed unless you explicitly ask for them in devtools::install (right?).

Also, as an aside, get a load of the import package, which brings Pythonic `from Z import X as Y` syntax to R. I might switch to using it in a bunch of my interactive scripts.
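
A minimal sketch of that syntax (assuming the CRAN import package; the packages and aliases below are just illustrations, not part of this PR):

```r
# Rough equivalents of Python's "from Z import X as Y":
import::from(magrittr, "%>%")              # from magrittr import %>%
import::from(dplyr, select, filter)        # from dplyr import select, filter
import::from(stats, rnorm_draws = rnorm)   # from stats import rnorm as rnorm_draws
```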

Member:

Fair point on Suggests vs. Imports, but I see calls to all three of tibble, purrr, and tidyr in R/, so those should probably be in Imports anyway 😉

data.table,
rcrossref,
here
Member:

Is here essential? R CMD check does not approve of the .here file, and the here package documentation says "This package is intended for interactive use only. Use rprojroot::has_file() or the other functions in the rprojroot package for more control, or for package development."

Member Author:

Good catch on the R CMD check. I'll drop the .here file, since here should be able to find the package root anyway.

In general, I'm including here only in Suggests, and it's never actually used in any package functions (at least, I'm pretty sure it isn't; I'll have to check) -- only in the TRY import scripts, which are meant to be run interactively. I'm aware of rprojroot, but I didn't think the extra complexity was worth it for these scripts.
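
For reference, a minimal rprojroot sketch of the same root-finding (illustrative only; the inst/import-try/data-proc path is the one from this PR's .gitignore):

```r
# rprojroot equivalent of here::here() for locating files under a package root:
pkg_root <- rprojroot::find_root(rprojroot::is_r_package)
data_dir <- file.path(pkg_root, "inst", "import-try", "data-proc")
```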

License: FreeBSD + file LICENSE
Copyright: Authors
LazyLoad: yes
6 changes: 6 additions & 0 deletions base/db/NAMESPACE
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(append.covariate)
export(assign.treatments)
export(bety2pecan)
@@ -12,6 +13,7 @@ export(db.print.connections)
export(db.query)
export(db.showQueries)
export(dbHostInfo)
export(db_merge_into)
export(dbfile.check)
export(dbfile.file)
export(dbfile.id)
@@ -32,7 +34,9 @@ export(get_run_ids)
export(get_users)
export(get_var_names)
export(get_workflow_ids)
export(insert_table)
export(load_data_single_run)
export(match_dbcols)
export(ncdays2date)
export(query.file.path)
export(query.format.vars)
@@ -43,7 +47,9 @@ export(query.trait.data)
export(query.traits)
export(rename_jags_columns)
export(runs)
export(search_references)
export(take.samples)
export(try2sqlite)
export(var_names_all)
export(workflow)
export(workflows)
29 changes: 29 additions & 0 deletions base/db/R/db_merge_into.R
@@ -0,0 +1,29 @@
#' Merge local data frame into SQL table
#'
#' @inheritParams insert_table
#' @inheritDotParams insert_table
#' @param by Character vector of columns by which to perform merge. Defaults to all columns in `values`
#' @return Data frame: Inner join of SQL table and input data frame (as unevaluated "lazy query" table)
#' @export
#' @examples
#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
#' dplyr::copy_to(irisdb, iris[1:10,], name = "iris", overwrite = TRUE)
#' db_merge_into(iris[1:12,], "iris", irisdb)
#' dplyr::tbl(irisdb, "iris") %>% dplyr::count()
db_merge_into <- function(values, table, con, by = NULL, drop = FALSE, ...) {
values_fixed <- match_dbcols(values, table, con, drop = FALSE)
if (is.null(by)) {
by <- match_colnames(values, table, con)
}
sql_tbl <- dplyr::tbl(con, table)
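# Anti join: keep only the local rows not already present in the SQL table,
# matched on the `by` columns (`copy = TRUE` brings the remote table into the join)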
values_merge <- dplyr::anti_join(values_fixed, sql_tbl, by = by, copy = TRUE)
if (nrow(values_merge) < 1 || ncol(values_merge) < 1) {
PEcAn.logger::logger.warn(
"Input table for merge is empty."
)
} else {
insert <- insert_table(values_merge, table, con, ...)
}
dplyr::tbl(con, table) %>%
dplyr::inner_join(values_fixed, copy = TRUE)
}
112 changes: 112 additions & 0 deletions base/db/R/insert_table.R
@@ -0,0 +1,112 @@
#' Insert R data frame into SQL database
#'
#' First, subset to matching columns. Then, make sure the local and SQL column
#' classes match, coercing local to SQL as necessary (or throwing an error).
#' Then, build an SQL string for the insert statement. Finally, insert into the
#' database.
#'
#' @param values `data.frame` of values to write to SQL database
#' @param table Name of target SQL table, as character
#' @param coerce_col_class logical, whether or not to coerce local data columns
#' to SQL classes. Default = `TRUE.`
#' @param drop logical. If `TRUE` (default), drop columns not found in SQL table.
#' @inheritParams db.query
#' @inherit db.query return
#' @export
#' @examples
#' irisdb <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
#' dplyr::copy_to(irisdb, iris[1,], name = "iris", overwrite = TRUE)
#' insert_table(iris[-1,], "iris", irisdb)
#' dplyr::tbl(irisdb, "iris")
insert_table <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) {
values_fixed <- match_dbcols(values, table, con, coerce_col_class, drop = TRUE)
insert_query <- build_insert_query(values_fixed, table, .con = con)
db.query(insert_query, con)
}

#' Match column names and classes between local and SQL table
#'
#' @inheritParams insert_table
#' @return `values` `data.frame` with column names and classes matched to SQL
#' @export
match_dbcols <- function(values, table, con, coerce_col_class = TRUE, drop = TRUE) {
use_cols <- match_colnames(values, table, con)
if (length(use_cols) < 1) {
PEcAn.logger::logger.severe(
"No columns match between input and target table."
)
}
PEcAn.logger::logger.debug(
"Matched the following cols: ",
paste(use_cols, collapse = ", ")
)
values_sub <- values[, use_cols]
# Load one row to get column types
sql_row <- dplyr::tbl(con, table) %>% head(1) %>% dplyr::collect()
sql_types <- purrr::map(sql_row, class) %>%
purrr::map_chr(1) %>%
.[use_cols]
values_types <- purrr::map(values_sub, class) %>% purrr::map_chr(1)
type_mismatch <- sql_types != values_types
if (sum(type_mismatch) > 0) {
mismatch_string <- sprintf(
"%s: local is %s, SQL is %s",
names(values_types),
values_types,
sql_types
)[type_mismatch]
PEcAn.logger::logger.info(
"Found type mismatches in the following columns: ",
paste0(mismatch_string, collapse = "; ")
)
if (!coerce_col_class) {
PEcAn.logger::logger.severe(
"Type mismatch detected, and `coerce_col_class` is `FALSE`. ",
"Fix column class mismatches manually."
)
} else {
PEcAn.logger::logger.info(
"Coercing local column types to match SQL."
)
# Coerce values data frame to these types
values_fixed <- purrr::map2_dfc(values_sub, sql_types, as)
}
} else {
values_fixed <- values_sub
}
if (drop) {
values_fixed
} else {
drop_cols <- colnames(values)[!colnames(values) %in% use_cols]
dplyr::bind_cols(values_fixed, values[, drop_cols])
}
}

#' Match names of local data frame to SQL table
#'
#' @inheritParams insert_table
match_colnames <- function(values, table, con) {
tbl_db <- dplyr::tbl(con, table)
table_cols <- dplyr::tbl_vars(tbl_db)
values_cols <- colnames(values)
intersect(values_cols, table_cols)
}

#' Build query to insert R data frame into SQL table
#'
#' @inheritParams insert_table
#' @inheritParams glue::glue_sql
build_insert_query <- function(values, table, .con) {
value_list <- purrr::map(seq_len(nrow(values)), ~as.list(values[.x, ]))

insert_list <- value_list %>%
purrr::map(unname) %>%
purrr::map(dbplyr::escape) %>%
purrr::map(dbplyr::sql_vector)

glue::glue_sql(
"INSERT INTO {`table`} ({`colnames(values)`*}) ",
"VALUES {insert_list*}",
.con = .con
)
}
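
For a sense of the SQL this produces, a hypothetical interactive check (the table, values, and open connection `con` are made up; exact identifier quoting depends on the DBI backend):

```r
df <- data.frame(genus = "Acer", species = "rubrum", stringsAsFactors = FALSE)
build_insert_query(df, "species", .con = con)
#> <SQL> INSERT INTO `species` (`genus`, `species`) VALUES ('Acer', 'rubrum')
```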
67 changes: 67 additions & 0 deletions base/db/R/search_references.R
@@ -0,0 +1,67 @@
#' Perform crossref search for a list of references
#'
#' @param queries Character vector of queries
#' @inheritDotParams search_reference_single
#' @inherit search_reference_single description return
#' @export
search_references <- function(queries, ...) {
search_fun <- search_reference_single %>%
purrr::partial(...) %>%
purrr::possibly(otherwise = data.frame(title = "Not found"))
encodeString(queries) %>%
purrr::map_dfr(search_fun)
}
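
A hypothetical usage sketch (the query strings are made up; requires the rcrossref package and network access):

```r
refs <- search_references(
  c("Wright 2004 The worldwide leaf economics spectrum Nature",
    "gibberish citation that will not match anything"),
  min_score = 80  # passed through to search_reference_single via `...`
)
```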

#' Perform crossref search for a single reference
#'
#' Requires the `rcrossref` package.
#'
#' @param query Citation string (length 1) to search for DOI
#' @param min_score Minimum match score. Default (85) is fairly strict.
#' @param limit Number of results to return
#' @return `data.frame` containing crossref information converted to match bety citations table.
search_reference_single <- function(query, limit = 1, min_score = 85) {
stopifnot(length(query) == 1)
PEcAn.logger::logger.debug("Processing query:\n", query)
crsearch <- rcrossref::cr_works(query = query, limit = limit)
if (is.null(crsearch[["data"]])) {
PEcAn.logger::logger.warn(
"Error in crossref query. ",
"Setting title to search string and leaving other fields blank."
)
return(tibble::tibble(query = query))
}
crdata <- crsearch[["data"]] %>%
dplyr::mutate(score = as.numeric(score)) %>%
dplyr::filter(score > min_score)
if (nrow(crdata) < 1) {
PEcAn.logger::logger.info(
"No matches found. ",
"Setting title to search string and leaving other fields blank.")
return(tibble::tibble(query = query))
}
keep_cols <- c(
"author",
"year",
"title",
journal = "container.title",
vol = "volume",
pg = "page",
doi = "DOI",
"score",
"query"
)
proc_search <- crdata %>%
dplyr::mutate(
# Get the first author only -- this is the BETY format
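# (purrr extractor syntax: list("family", 1) plucks .x[["family"]][[1]])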
author_family = purrr::map(author, list("family", 1)),
author_given = purrr::map(author, list("given", 1)),
author = paste(author_family, author_given, sep = ", "),
year = gsub("([[:digit:]]{4}).*", "\\1", issued) %>% as.numeric(),
query = query,
score = as.numeric(score)
)
use_cols <- keep_cols[keep_cols %in% colnames(proc_search)]
dplyr::select(proc_search, !!!use_cols)
}

91 changes: 91 additions & 0 deletions base/db/R/try2sqlite.R
@@ -0,0 +1,91 @@
#' Convert TRY text file to SQLite database
#'
#' The TRY file is huge and unnecessarily long, which makes it difficult to
#' work with. The resulting SQLite database is much smaller on disk, and can be
#' read much faster thanks to lazy evaluation.
#'
#' The resulting TRY SQLite database contains the following tables:
#' - `values` -- The actual TRY data. Links to all other tables through ID columns.
#' - `traits` -- Description of trait and data names. Links to `values` through `DataID`. Similar to BETY `variables` table.
#' - `datasets` -- Description of datasets and references/citations. Links to `values` through `DatasetID` and `ReferenceID`.
#' - `species` -- Species. Links to `values` through `AccSpeciesID`.
#'
#' @param try_files Character vector of file names containing TRY data.
#' Multiple files are combined with `data.table::rbindlist`.
#' @param sqlite_file Target SQLite database file name, as character.
#' @export
try2sqlite <- function(try_files, sqlite_file = "try.sqlite") {
# Read files
PEcAn.logger::logger.info("Reading in TRY data...")
raw_data <- Map(data.table::fread, try_files) %>%
data.table::rbindlist()

# Create integer reference ID for compact storage
PEcAn.logger::logger.info("Adding ReferenceID column")
raw_data[["ReferenceID"]] <- as.integer(factor(raw_data[["Reference"]]))

# Create tables
PEcAn.logger::logger.info("Extracting data values table.")
data_cols <- c(
"ObsDataID", # TRY row ID -- unique to each observation of a given trait
"ObservationID", # TRY "entity" ID -- identifies a set of trait measurements (e.g. leaf)
"DataID", # Links to data ID
"StdValue", # Standardized, QA-QC'ed value
"UnitName", # Standardized unit
"AccSpeciesID", # Link to 'species' table
"DatasetID", # Link to 'datasets' table.
"ReferenceID", # Link to 'try_references' table.
"ValueKindName", # Type of value, e.g. mean, min, max, etc.
"UncertaintyName", # Kind of uncertainty
"Replicates", # Number of replicates
"RelUncertaintyPercent",
"OrigValueStr", # Original data, as character string (before QA/QC)
"OrigUnitStr", # Original unit, as character string (before QA/QC)
"OrigUncertaintyStr" # Original uncertainty, as character string (before QA/QC)
)
data_values <- unique(raw_data[, data_cols, with = FALSE])

PEcAn.logger::logger.info("Extrating datasets table...")
datasets_cols <- c(
"DatasetID",
"Dataset",
"LastName",
"FirstName",
"Reference",
"ReferenceID"
)
datasets_values <- unique(raw_data[, datasets_cols, with = FALSE])

PEcAn.logger::logger.info("Extracting traits table...")
traits_cols <- c(
"DataID",
"DataName",
"TraitID",
"TraitName"
)
traits_values <- unique(raw_data[, traits_cols, with = FALSE])

PEcAn.logger::logger.info("Extracting species table...")
species_cols <- c(
"AccSpeciesID",
"AccSpeciesName",
"SpeciesName"
)
species_values <- unique(raw_data[, species_cols, with = FALSE])

PEcAn.logger::logger.info("Writing tables to SQLite database...")
con <- DBI::dbConnect(RSQLite::SQLite(), sqlite_file)
on.exit(DBI::dbDisconnect(con))
PEcAn.logger::logger.info("Writing values table...")
DBI::dbWriteTable(con, "values", data_values)
PEcAn.logger::logger.info("Writing traits table...")
DBI::dbWriteTable(con, "traits", traits_values)
PEcAn.logger::logger.info("Writing datasets table...")
DBI::dbWriteTable(con, "datasets", datasets_values)
PEcAn.logger::logger.info("Writing species table...")
DBI::dbWriteTable(con, "species", species_values)

PEcAn.logger::logger.info("Done creating TRY SQLite database!")

NULL
}
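
A hypothetical sketch of reading the result lazily with dplyr (the trait name below is illustrative, not a real TRY trait string):

```r
con <- DBI::dbConnect(RSQLite::SQLite(), "try.sqlite")
values <- dplyr::tbl(con, "values")   # lazy -- nothing is read yet
traits <- dplyr::tbl(con, "traits")
sla <- values %>%
  dplyr::inner_join(traits, by = "DataID") %>%
  dplyr::filter(TraitName == "Specific leaf area") %>%   # hypothetical trait name
  dplyr::select(AccSpeciesID, StdValue, UnitName) %>%
  dplyr::collect()   # the query executes here
DBI::dbDisconnect(con)
```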
12 changes: 9 additions & 3 deletions base/db/R/zz.imports.R
@@ -1,3 +1,9 @@
##' Imports from other packages
##'
##' @importFrom magrittr `%>%`
#' Imports from other packages
#'
#' @name otherimports
#' @importFrom magrittr %>%
#' @export
magrittr::`%>%`

#' @rdname otherimports
#' @importFrom rlang !! !!!