This repository has been archived by the owner on Apr 15, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path2021-09-22_alyssa_dfci_prostate_upload.R
82 lines (62 loc) · 3.33 KB
/
2021-09-22_alyssa_dfci_prostate_upload.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# Description: Determine why 2021-09-21 DFCI prostate upload is missing sample IDs.
# Author: Haley Hunter-Zinck
# Date: 2021-09-22
# Request:
# Source: slack message from Alyssa highlighting the issue in the QA reports
# Message: I am having trouble shooting the issues found in https://www.synapse.org/#!Synapse:syn25931845
# as it seems that the records for the missing sample IDs do have sample IDs
# user input ----------------------------
cohort = "Prostate"
site = "DFCI"
# setup ----------------------------
tic = as.double(Sys.time())
library(glue)
library(dplyr)
library(data.table)
library(synapser)
synLogin()
# synapse
synid_table_upload <- "syn25892806"
synid_table_genie_sample <- "syn7517674"
# storage
synid_file_upload <- c()
# functions ----------------------------
# read ----------------------------
# read data part 1
query <- glue("SELECT data1 FROM {synid_table_upload} WHERE cohort = '{cohort}' AND site = '{site}'")
ent_id <- synTableQuery(query, includeRowIdAndRowVersion=FALSE)
synid_file_upload = append(synid_file_upload, as.character(read.csv(ent_id$filepath, na.strings = c(""), stringsAsFactors = FALSE)))
ent_data <- synGet(tail(synid_file_upload,1))
data1 <- read.csv(ent_data$path, check.names = FALSE, na.strings = c(""), stringsAsFactors = FALSE)
# read associated header, part 1
query <- glue("SELECT header1 FROM {synid_table_upload} WHERE cohort = '{cohort}' AND site = '{site}'")
ent_id <- synTableQuery(query, includeRowIdAndRowVersion=FALSE)
synid_file_upload = append(synid_file_upload, as.character(read.csv(ent_id$filepath, na.strings = c(""), stringsAsFactors = FALSE)))
ent_data <- synGet(tail(synid_file_upload,1))
colnames(data1) <- as.character(read.csv(ent_data$path, check.names = FALSE, na.strings = c(""), stringsAsFactors = FALSE))
# set up data matrix
data <- data1
setDT(data)
# main ----------------------------
# Number of Records
print(paste("Number of unique patient IDs for", site, ": ", length(unique(data$record_id)) ))
# Check if IDs match Main Genie list --- Sample IDs
sample_id_info <- data[redcap_repeat_instrument=='cancer_panel_test',list(record_id, redcap_repeat_instance, cpt_genie_sample_id)]
sample_id_list <- sample_id_info$cpt_genie_sample_id
temp <- toString(sample_id_list)
temp <- sapply(strsplit(temp, '[, ]+'), function(x) toString(shQuote(x)))
sample_in_genie <-synTableQuery(glue("select SAMPLE_ID from {synid_table_genie_sample}
where SAMPLE_ID in ({temp})"))
sample_in_genie <- sample_in_genie$asDataFrame()$SAMPLE_ID
sample_not_in_genie <- sample_id_info[!sample_id_info$cpt_genie_sample_id %in% sample_in_genie,]
# look at sample info for records missing sample IDs
res<- sample_id_info[is.element(record_id, sample_not_in_genie$record_id),]
# summary ------------------------------
n_na_repeat_1 <- length(which(res$redcap_repeat_instance[is.na(res$cpt_genie_sample_id)] == 1))
n_na_repeat_2_plus <- length(which(res$redcap_repeat_instance[is.na(res$cpt_genie_sample_id)] > 1))
print(glue("Number of repeat instance 1 that have sample ID as NA: {n_na_repeat_1}"))
print(glue("Number of repeat instance 2+ that have sample ID as NA: {n_na_repeat_2_plus}"))
table(res$redcap_repeat_instance[is.na(res$cpt_genie_sample_id)])
# close out ----------------------------
toc = as.double(Sys.time())
print(glue("Runtime: {round(toc - tic)} s"))