From 72fccdd6282fbe9b25699ab322241164cc12080c Mon Sep 17 00:00:00 2001 From: Dan Chaltiel <15105152+DanChaltiel@users.noreply.github.com> Date: Fri, 27 Dec 2024 10:15:40 +0100 Subject: [PATCH] compare_databases (WIP) --- NEWS.md | 1 + R/compare_databases.R | 103 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+) create mode 100644 R/compare_databases.R diff --git a/NEWS.md b/NEWS.md index f06bc68..b8c2113 100644 --- a/NEWS.md +++ b/NEWS.md @@ -6,6 +6,7 @@ EDCimport is a package designed to easily import data from EDC software TrialMas ### New features +- New function `compare_databases()`, which compares the structure of several extractions of a database: added/removed columns, number of patients, etc. (#26) - New function `edc_patient_gridplot()`, which creates a ggplot matrix giving the presence of all patients in all datasets (#77) - Improved `lastnews_table()`: allow regex in except & prefer, improved warning message, and allow saving warning as csv (#78) - New argument `subdirectories` to all reading functions (`read_trialmaster()`, `read_all_xpt()`, `read_all_sas()`, and `read_all_csv()`), to control whether to read sub-directories. Note that until now, those subdirectories were read and could overwrite root files. diff --git a/R/compare_databases.R b/R/compare_databases.R new file mode 100644 index 0000000..0efd40b --- /dev/null +++ b/R/compare_databases.R @@ -0,0 +1,103 @@ + + + +compare_databases = function(data_list){ + library(flextable) + library(tidyverse) + library(patchwork) + data_list = read_hrnbl2_multi() + tm_all = data_list + + + # x = compare_databases(tm_all) + + + tm_names_all = tm_all %>% map(names) %>% + unlist() %>% unique() %>% sort() %>% + setdiff(c(".lookup", "date_extraction", "datetime_extraction")) + + + # Table present / absent ---------------------------------------------------------------------- + + table_presence = tm_all %>% + map(~{ + ifelse(tm_names_all %in% names(.x), "Present", "Absent") + }) %>% + bind_cols(dataset=tm_names_all, .) + + table_presence %>% + flextable() %>% + bg(j=-1, bg=function(x)ifelse(x=="Present", "darkgreen", "red")) + + + + # Lookups ------------------------------------------------------------------------------------- + + + lk = tm_all %>% + map(".lookup") %>% + bind_rows(.id="source") %>% + tidyr::complete(source, dataset) %>% + arrange(dataset) %>% + as_tibble() + + ## Graphique d'évolution de nrow, ncol, n_id, rows_per_id ----- + + ### as facets ----- + fig1 = lk %>% + select(source, dataset, nrow, ncol, n_id, rows_per_id) %>% + pivot_longer(-c(source, dataset)) %>% + ggplot() + + aes(x=value, y=dataset, color=source, group=dataset) + + geom_point(na.rm=TRUE) + geom_line(na.rm=TRUE) + + facet_wrap(~name, scales="free_x") + + ### as list ----- + v = c("Number of rows"="nrow", "Number of columns"="ncol", + "Number of patients"="n_id", "Number of rows per patient"="rows_per_id") + fig1b = v %>% imap(~{ + lk %>% + select(source, dataset, value=all_of(unname(.x))) %>% + ggplot() + + aes(x=value, y=dataset, color=source, group=dataset) + + geom_point(na.rm=TRUE) + geom_line(na.rm=TRUE) + + xlab(.y) + + theme(legend.position="top") + }) + fig1b %>% wrap_plots(guides="collect") & theme(legend.position="top") + + + + ## Evolution des names/labels ----- + lk2 = lk %>% + select(source, dataset, names) %>% + arrange(dataset, source) %>% + mutate( + added = purrr::map2_chr(names, lag(names), \(cur, prev){ + if(is.null(prev)) return(NA) + setdiff(cur, prev) %>% paste(collapse=", ") %>% na_if("") + }), + removed = purrr::map2_chr(names, lag(names), \(cur, prev){ + if(is.null(prev)) return(NA) + setdiff(prev, cur) %>% paste(collapse=", ") %>% na_if("") + }), + .by=dataset + ) %>% + select(-names) + + lk2 %>% + # filter(any(!is.na(added)) | any(!is.na(removed)), + # .by=dataset) %>% + filter(!is.na(added) | !is.na(removed)) %>% + arrange(source) %>% + flextable() %>% autofit() %>% + merge_v(j="source") %>% + hline(i=~source!=lead(source, default=source[1])) %>% + hline(j="source") %>% + fix_border_issues() + + + + + +}