This repository has been archived by the owner on Apr 15, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path2021-09-28_jessica_dfci_panc_complete_col.R
107 lines (83 loc) · 3.21 KB
/
2021-09-28_jessica_dfci_panc_complete_col.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
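# Description: Compare counts of cancer_diagnosis_complete values for the
#   PANC cohort between DFCI (site being checked) and MSK (control site)
#   across the Synapse diagnosis table, the site upload files, the intake
#   file, and the RCC file.
# Author: Haley Hunter-Zinck
# Date: 2021-09-28 (taken from the file name)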
# setup ----------------------------

# start time in seconds; read again at close out to report the runtime
tic <- as.numeric(Sys.time())

library(glue)
library(dplyr)
library(synapser)
synLogin()

# synapse
# IDs of the Synapse entities read by this script
synid_table_diag <- "syn21446701"
synid_dfci_panc_upload <- list(
  data1 = "syn25544638",
  header1 = "syn25544639"
)
synid_msk_panc_upload <- list(data1 = "syn25541828")
synid_file_panc_intake <- "syn24175803"
synid_file_panc_rcc <- "syn25578183"

# parameters
# cohort restricts the table query; the two sites select rows in each check
cohort <- "PANC"
site_check <- "DFCI"
site_control <- "MSK"
# functions ----------------------------
#' Read a delimited Synapse file entity into a data frame.
#'
#' Fetches the entity with `synGet()` and parses the file found at its
#' `$path` with `read.csv()`. Character columns are never converted to
#' factors.
#'
#' @param synapse_id Synapse ID of the file entity to read.
#' @param sep Field separator passed to `read.csv()`.
#' @param na.strings Character vector of strings to interpret as `NA`.
#' @param ... Further arguments forwarded to `read.csv()`, e.g. `header`,
#'   `skip` or `check.names`. `stringsAsFactors` is fixed to `FALSE` and
#'   must not be passed here.
#' @return A data frame holding the parsed file contents.
get_synapse_entity_data_in_csv <- function(synapse_id, sep = ",",
                                           na.strings = c("NA"), ...) {
  entity <- synGet(synapse_id)
  # FALSE rather than F: F is an ordinary variable that can be reassigned
  read.csv(entity$path,
    stringsAsFactors = FALSE,
    na.strings = na.strings,
    sep = sep,
    ...
  )
}
#' Look up the name of a Synapse entity.
#'
#' Calls `synGet()` with `downloadFile = FALSE` and reads the `name` field
#' from the returned entity's `properties`.
#'
#' @param synapse_id Synapse ID of the entity.
#' @return The value stored in `properties$name` of the entity.
get_synapse_entity_name <- function(synapse_id) {
  # FALSE rather than F: F is an ordinary variable that can be reassigned
  entity <- synGet(synapse_id, downloadFile = FALSE)
  entity$properties$name
}
# read ----------------------------

# upload for the site being checked; its column names are taken from a
# second Synapse file
data_check <- get_synapse_entity_data_in_csv(synid_dfci_panc_upload$data1)
# NOTE(review): the value assigned here is a data frame, not a character
# vector, so the names come from coercing each of its columns to a string --
# confirm the header file layout makes that yield the intended column names
colnames(data_check) <- get_synapse_entity_data_in_csv(
  synid_dfci_panc_upload$header1
)

# control site upload and intake file
data_control <- get_synapse_entity_data_in_csv(synid_msk_panc_upload$data1)
data_intake <- get_synapse_entity_data_in_csv(synid_file_panc_intake)

# RCC file: tab-delimited, first 36 lines skipped before the header row.
# Assigned only to data_rcc; a chained assignment to `data` would leave an
# unused global that masks utils::data().
data_rcc <- read.csv(synGet(synid_file_panc_rcc)$path,
  stringsAsFactors = FALSE,
  na.strings = c("NA"),
  skip = 36,
  sep = "\t",
  check.names = TRUE
)
# check.names = TRUE rewrites headers into syntactic names; show the first few
head(colnames(data_rcc))
# main ----------------------------

# Tally rows per value of cancer_diagnosis_complete.
# df: data frame that has a cancer_diagnosis_complete column.
# Returns a data frame grouped by that column with the row count in `n`.
count_complete <- function(df) {
  df %>%
    select(cancer_diagnosis_complete) %>%
    group_by(cancer_diagnosis_complete) %>%
    count()
}

# check tables
# counts per completion value and data access group for the cohort
query <- glue(
  "SELECT cancer_diagnosis_complete, redcap_data_access_group, COUNT(*) ",
  "FROM {synid_table_diag} WHERE cohort = '{cohort}' ",
  "GROUP BY cancer_diagnosis_complete, redcap_data_access_group"
)
res <- as.data.frame(
  synTableQuery(query, includeRowIdAndRowVersion = FALSE)
)
print(res)

# Every tally below is printed explicitly so the output also appears when
# the script is run through source(), where auto-printing is off.

# check upload file
data_check %>%
  filter(grepl(pattern = site_check, x = record_id)) %>%
  filter(redcap_repeat_instrument == "cancer_diagnosis") %>%
  count_complete() %>%
  print()

# check control upload file
data_control %>%
  filter(grepl(pattern = site_control, x = record_id)) %>%
  filter(redcap_repeat_instrument == "cancer_diagnosis") %>%
  count_complete() %>%
  print()

# check intake for dfci
data_intake %>%
  filter(redcap_repeat_instrument == "Cancer Diagnosis") %>%
  filter(redcap_data_access_group == site_check) %>%
  count_complete() %>%
  print()

# check intake for control site
data_intake %>%
  filter(redcap_repeat_instrument == "Cancer Diagnosis") %>%
  filter(redcap_data_access_group == site_control) %>%
  count_complete() %>%
  print()

# check rcc file for site being checked
data_rcc %>%
  filter(Event.Name.Occurrence. == "cancer_diagnosis") %>%
  filter(Site.Name == site_check) %>%
  count_complete() %>%
  print()

# check rcc file for control site
data_rcc %>%
  filter(Event.Name.Occurrence. == "cancer_diagnosis") %>%
  filter(Site.Name == site_control) %>%
  count_complete() %>%
  print()
# close out ----------------------------

# report elapsed time since `tic` was recorded in setup, in whole seconds
toc <- as.double(Sys.time())
elapsed_s <- round(toc - tic)
print(glue("Runtime: {elapsed_s} s"))