d3b-center · jharenza · Jan 3, 2024 · Dec 30, 2023 · Dec 30, 2023 · Dec 30, 2023
diff --git a/.gitignore b/.gitignore
@@ -21,3 +21,8 @@ open_pbta_envs.txt
 
 # Everything in scratch
 .scratch/
+
+# Expression files in subset directories
+analyses/molecular-subtyping*/*subset/*rsem-tpm*
+
+
diff --git a/analyses/create-subset-files/01-get_biospecimen_identifiers.R b/analyses/create-subset-files/01-get_biospecimen_identifiers.R
diff --git a/analyses/create-subset-files/02-subset_files.R b/analyses/create-subset-files/02-subset_files.R
@@ -16,40 +16,8 @@ suppressWarnings(
 )
 suppressPackageStartupMessages(library(optparse))
 suppressPackageStartupMessages(library(data.table))
-suppressPackageStartupMessages(library(arrow))
 suppressPackageStartupMessages(options(readr.show_col_types = FALSE))
 
-write_maf_file <- function(maf_df, file_name, version_string) {
-  # Given a data.frame that contains the fields for a MAF file, write a gzipped
-  # MAF file and include the version information provided in version_string.
-  #
-  # Note: if file_name exists, it will be overwritten
-  #
-  # Args:
-  #   maf_df: A data.frame that contains the MAF info.
-  #   file_name: Output file name, including the full path.
-  #   version_string: the version string that will be written to the first line
-  #                   of the file at file_name
-  #
-  # Returns: intended to be used to write files only
-
-  # if the file name supplied to this function ends in `.gz`, take it out for
-  # the purposes of writeLines, etc.
-  # we'll gzip it at the end with R.utils::gzip and this extension is not needed
-  if (grepl(".gz", file_name)) {
-    file_name <- sub(".gz", "", file_name)
-  }
-
-  # write the version string to the top of the file
-  writeLines(version_string, con = file_name)
-
-  # write the tabular data of maf_df
-  readr::write_tsv(maf_df, path = file_name, append = TRUE, col_names = TRUE)
-
-  # now gzip the file
-  R.utils::gzip(file_name, overwrite = TRUE)
-}
-
 subset_files <- function(filename, biospecimen_ids, output_directory) {
   # given the full path to a file to be subset and the list of biospecimen ids
   # to use for subsetting, write a file of the same name to the output directory
@@ -75,30 +43,6 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {
   # filtering strategy depends on the file type, mostly because how the sample
   # IDs change based on the file type -- that's why this logic is required
   if (grepl("snv", filename)) {
-    # if (grepl("hotspots", filename)) {
-    #   snv_file <- data.table::fread(filename,
-    #                                 skip = 1,  # skip version string
-    #                                 data.table = FALSE,
-    #                                 showProgress = FALSE)
-    #   # we need to obtain the version string from the first line of the MAF file
-    #   version_string <- readLines(filename, n = 1)
-    #   # filter + write to file with custom function
-    #   snv_file %>%
-    #     dplyr::filter(Tumor_Sample_Barcode %in% biospecimen_ids) %>%
-    #     write_maf_file(file_name = output_file,
-    #                    version_string = version_string)
-    #   snv_file %>%
-    #     dplyr::filter(Tumor_Sample_Barcode %in% biospecimen_ids) %>%
-    #     readr::write_tsv(output_file)
-    # } else {
-    #   # in a column 'Tumor_Sample_Barcode'
-    #   snv_file <- data.table::fread(filename, data.table = FALSE, 
-    #                                 showProgress = FALSE)
-    #   snv_file %>%
-    #     dplyr::filter(Tumor_Sample_Barcode %in% biospecimen_ids) %>%
-    #     readr::write_tsv(output_file)
-    # }
-    # in a column 'Tumor_Sample_Barcode'
     snv_file <- data.table::fread(filename, data.table = FALSE, 
                                   showProgress = FALSE)
     snv_file %>% 
@@ -133,7 +77,7 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {
       fusion_file %>%
         dplyr::filter(Sample %in% biospecimen_ids |
                         # this is required for the the fusion-summary module and TP53 module
-                        grepl("RELA|MN1|EWSR1|FGFR1--TACC1|MYB--QKI|BRAF|TP53--TRPS1|TP53--PSMG4", FusionName)) %>%
+                        grepl("ZFTA|MN1|EWSR1|FGFR1--TACC1|MYB--QKI|BRAF|TP53--TRPS1|TP53--PSMG4", FusionName)) %>%
         readr::write_tsv(output_file)
     } else if (grepl("dgd", filename)) {
       fusion_file %>%
@@ -168,9 +112,14 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {
       expression_file %>% dplyr::select(transcript_id, gene_symbol,
                                         !!!rlang::quos(any_of(biospecimen_ids))) %>% 
         readr::write_rds(output_file)
-    } else if (grepl("methyl", filename)) {
-      expression_file %>% dplyr::select(Probe_ID, 
-                                        !!!rlang::quos(any_of(biospecimen_ids))) %>% 
+   # } else if (grepl("methyl", filename)) {
+  #    expression_file %>% dplyr::select(Probe_ID, 
+   #                                     !!!rlang::quos(any_of(biospecimen_ids))) %>% 
+    #    readr::write_rds(output_file)
+    } else if (grepl("gtex", filename)) {
+      expression_file <- readr::read_rds(filename)
+      biospecimen_ids <- intersect(colnames(expression_file), biospecimen_ids)
+      expression_file %>% dplyr::select(!!!rlang::quos(any_of(biospecimen_ids))) %>% 
         readr::write_rds(output_file)
     } else {
       expression_file %>% dplyr::select(!!!rlang::quos(any_of(biospecimen_ids))) %>% 
@@ -182,12 +131,11 @@ subset_files <- function(filename, biospecimen_ids, output_directory) {
     independent_file %>% 
       dplyr::filter(Kids_First_Biospecimen_ID %in% biospecimen_ids) %>%
       readr::write_tsv(output_file)
-  } else if (grepl("splice-events-rmats", filename)) {
+ # } else if (grepl("splice-events-rmats", filename)) {
     # in a column 'sample_id'
-    rmats_file <- arrow::read_tsv_arrow(filename)
-    rmats_file %>% 
-      dplyr::filter(sample_id %in% biospecimen_ids) %>%
-      readr::write_tsv(output_file)
+  #  rmats_file <- vroom::vroom(filename) %>%
+   #   dplyr::filter(sample_id %in% biospecimen_ids) %>%
+    #  readr::write_tsv(output_file)
   } else {
     # error-handling
     stop("File type unrecognized by 'subset_files'")

diff --git a/analyses/create-subset-files/README.md b/analyses/create-subset-files/README.md
@@ -1,4 +1,4 @@
-## Steps for creating subset files for CI
+## Steps for creating subset files for GitHub Actions CI
 
 1. Update to the most recent release of the data by running `bash download-data.sh` in the root directory of the repository.
 2. Run the shell script to generate subset files (from the root directory of the repository):
@@ -21,6 +21,8 @@ Non-matched samples are also added to each file (10% of `--num_matched`), which
 Some files are copied over in their entirety (e.g., BED files).
 See `create_subset_files.sh` for more information.
 
+Note: `splice-events-rmats.tsv.gz` and all `methyl*` files are skipped in v13 because they are very large and no analysis modules currently use them routinely.
+
 #### Special considerations
 
 Certain analysis modules have required modifications to the subset file creation steps beyond randomly selecting participants.
@@ -55,6 +57,6 @@ Running the following from the root directory of the repository
 SKIP_SUBSETTING=1 ./analyses/create-subset-files/create_subset_files.sh
 ```
 
-will skip the subsetting file steps that are implemented in R and only copy files that are included in full (e.g., `pbta-histologies.tsv`) and generate a new `md5sum.txt`.
+will skip the subsetting file steps that are implemented in R and only copy files that are included in full (e.g., `histologies.tsv`) and generate a new `md5sum.txt`.
 This is intended to be used when the only files that need to be updated are those that are copied over without being reduced in size in anyway.
 
diff --git a/analyses/create-subset-files/biospecimen_ids_for_subset.RDS b/analyses/create-subset-files/biospecimen_ids_for_subset.RDS
diff --git a/analyses/create-subset-files/create_subset_files.sh b/analyses/create-subset-files/create_subset_files.sh
@@ -8,7 +8,7 @@ set -o pipefail
 
 # Set defaults for release and biospecimen file name
 BIOSPECIMEN_FILE=${BIOSPECIMEN_FILE:-biospecimen_ids_for_subset.RDS}
-RELEASE=${RELEASE:-v12}
+RELEASE=${RELEASE:-v13}
 NUM_MATCHED=${NUM_MATCHED:-15}
 
 # This option controls whether or not the two larger MAF files are skipped as
@@ -41,7 +41,6 @@ fi
 # download Illumina methylation annotations file if does not exist in data
 # from the data release s3 bucket
 URL="https://d3b-openaccess-us-east-1-prd-pbta.s3.amazonaws.com/open-targets"
-RELEASE="v12"
 PROBES="infinium.gencode.v39.probe.annotations.tsv.gz"
 if [ -f "${DATA_DIRECTORY}/${PROBES}" ]; then
     echo "${PROBES} exists, skip downloading"
@@ -101,6 +100,15 @@ cp $FULL_DIRECTORY/cnv-consensus-gistic.zip $SUBSET_DIRECTORY
 # all bed files
 cp $FULL_DIRECTORY/*.bed $SUBSET_DIRECTORY
 
+# DGD fusion file
+cp $FULL_DIRECTORY/fusion-dgd.tsv.gz $SUBSET_DIRECTORY
+
+# All proteomic files
+cp $FULL_DIRECTORY/*protein* $SUBSET_DIRECTORY
+
+# Full tumor only MAF (for now, it is small)
+cp $FULL_DIRECTORY/snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz $SUBSET_DIRECTORY
+
 # if the md5sum.txt file already exists, get rid of it
 cd $SUBSET_DIRECTORY
 rm -f md5sum.txt

diff --git a/analyses/efo-mondo-mapping/results/efo-mondo-map-prefill.tsv b/analyses/efo-mondo-mapping/results/efo-mondo-map-prefill.tsv
@@ -4,73 +4,70 @@ Acute Myeloid Leukemia	EFO_0000222	MONDO_0018874	NCIT_C3171
 Adamantinomatous Craniopharyngioma	EFO_1000069	MONDO_0002787	NCIT_C4726
 Adenocarcinoma	EFO_0000228	MONDO_0004970	NCIT_C2852
 Adrenocortical Carcinoma	EFO_1000796	MONDO_0006639	NCIT_C9325
-Anaplastic Large Cell Lymphoma	EFO_0003032	MONDO_0020325	NCIT_C3720
+Alveolar soft part sarcoma	NA	NA	NA
 Angiosarcoma	EFO_0003968	MONDO_0016982	NCIT_C3088
 Astroblastoma	MONDO_0016707	MONDO_0016707	NCIT_C4324
 Astrocytoma	EFO_0000272	MONDO_0019781	NCIT_C6958
 Atypical Teratoid Rhabdoid Tumor	EFO_1002008	MONDO_0020560	NCIT_C6906
-Atypical choroid plexus papilloma	MONDO_0002684	MONDO_0002684	NCIT_C53686
 B Acute Lymphoblastic Leukemia/Lymphoma	EFO_0000094	MONDO_0004967	NCIT_C8644
 Bladder Urothelial Carcinoma	EFO_0006544	MONDO_0005611	NCIT_C39851
 Breast Invasive Carcinoma	EFO_1000307	MONDO_0006256	NCIT_C9245
 Burkitt Leukemia/Lymphoma	EFO_0000309	MONDO_0007243	NCIT_C2912
-CIC-DUX4 Sarcoma	EFO_0000691	MONDO_0005089	NCIT_C165663
-CIC-rearranged sarcoma	NA	NA	NA
 CNS Burkitt's lymphoma	EFO_0000309	MONDO_0007243	NCIT_C2912
 CNS Embryonal tumor	EFO_0005784	MONDO_0018843	NCIT_C5398
 CNS Melanoma	EFO_0002617	MONDO_0005191	NCIT_C133504
 CNS neuroblastoma	EFO_0000621	MONDO_0006130	NCIT_C4826
-CNS tumor with BCOR internal tandem duplication	NA	NA	NA
-Cavernoma	EFO_1000151	MONDO_0003155	NCIT_C3086
 Central neurocytoma	EFO_1000856	MONDO_0019134	NCIT_C3791
 Cervical Squamous Cell Carcinoma and Endocervical Adenocarcinoma	EFO_1000162	MONDO_0006143	NCIT_C157526
 Cholangiocarcinoma	EFO_0005221	MONDO_0019087	NCIT_C4436
 Chondromyxoid fibroma	EFO_0000332	MONDO_0018447	NCIT_C3830
 Chordoma	Orphanet_178	MONDO_0008978	NCIT_C2947
-Choroid plexus carcinoma	MONDO_0016718	MONDO_0016718	NCIT_C4715
-Choroid plexus papilloma	EFO_1000177	MONDO_0009837	NCIT_C3698
 Choroid plexus tumor	EFO_0007206	MONDO_0016717	NCIT_C4533
 Chromophobe renal cell carcinoma	EFO_0000335	MONDO_0017885	NCIT_C4146
 Chronic Myelogenous Leukemia	EFO_0000339	MONDO_0011996	NCIT_C3174
 Clear cell sarcoma of the kidney	EFO_0000350	MONDO_0005006	NCIT_C4264
 Colon Adenocarcinoma	EFO_1001949	MONDO_0002271	NCIT_C4349
+Colon Carcinoma	NA	NA	NA
+Congenital malignant brain tumor	NA	NA	NA
 Craniopharyngioma	EFO_1000209	MONDO_0002787	NCIT_C2964
 Cutaneous Melanoma	EFO_0000389	MONDO_0005012	NCIT_C3510
+Desmoid-type fibromatosis	EFO_0009907	Orphanet_873	NCIT_C9182
 Desmoplastic infantile astrocytoma and ganglioglioma	MONDO_0016731	MONDO_0016731	NCIT_C4747
 Diffuse fibrillary astrocytoma	MONDO_0016688	MONDO_0016688	NCIT_C4322
 Diffuse hemispheric glioma	MONDO_0016680	MONDO_0016680	NA
 Diffuse intrinsic pontine glioma	EFO_1000026	MONDO_0006033	NCIT_C94764
 Diffuse leptomeningeal glioneuronal tumor	MONDO_0016745	MONDO_0016745	NCIT_C129424
 Diffuse midline glioma	EFO_1000026	MONDO_0006033	NCIT_C129309
 Dysembryoplastic neuroepithelial tumor	EFO_0005551	MONDO_0005505	NCIT_C9505
-Dysgerminoma	MONDO_0003002	MONDO_0003002	NCIT_C2996
+EBV-Positive Diffuse Large B-Cell Lymphoma	NA	NA	NA
 Embryonal tumor with multilayer rosettes	MONDO_0016715	MONDO_0016715	NCIT_C129499
 Ependymoma	EFO_1000028	MONDO_0016698	NCIT_C3017
 Epstein-Barr virus-related tumor	MONDO_0017342	MONDO_0017342	NA
 Esophageal Carcinoma	EFO_0002916	MONDO_0019086	NCIT_C3513
 Ewing sarcoma	EFO_0000174	MONDO_0012817	NCIT_C4817
 Extraventricular neurocytoma	MONDO_0016727	MONDO_0016727	NCIT_C92555
 Fibromyxoid lesion	MONDO_0037745	MONDO_0037745	NCIT_C66760
+Follicular Variant Thyroid Gland Papillary Carcinoma	NA	NA	NA
 Ganglioglioma	EFO_0003094	MONDO_0016733	NCIT_C3788
 Ganglioneuroblastoma	EFO_0000502	MONDO_0005035	NCIT_C3790
 Ganglioneuroma	EFO_0000500	MONDO_0005033	NCIT_C3049
 Germ Cell Tumor	EFO_0000514	MONDO_0005040	NCIT_C3708
 Germinoma	MONDO_0020580	MONDO_0020580	NCIT_C121618
 Glial-neuronal tumor	MONDO_0016729	MONDO_0016729	NCIT_C4747
+Glial-neuronal tumor NOS	MONDO_0016729	MONDO_0016729	NCIT_C4747
 Glioblastoma	MONDO_0018177	MONDO_0018177	NCIT_C30587
 Glioblastoma Multiforme	EFO_0000519	MONDO_0018177	NCIT_C3058
 Head and Neck Squamous Cell Carcinoma	EFO_0000181	MONDO_0010150	NCIT_C34447
 Hemangioblastoma	MONDO_0016748	MONDO_0016748	NCIT_C3801
 Hepatoblastoma	EFO_1000292	MONDO_0018666	NCIT_C3728
 Hepatocellular Carcinoma	EFO_0000182	MONDO_0007256	NCIT_C3099
+Hepatocellular neoplasm NOS	NA	NA	NA
 High-grade glioma	MONDO_0100342	MONDO_0100342	NCIT_C4822
+High-grade neuroepithelial tumor	NA	NA	NA
 Histiocytic tumor	MONDO_0020081	MONDO_0020081	NCIT_C9294
 Hodgkin's lymphoma	EFO_0000183	MONDO_0004952	NCIT_C9357
 Infant-type hemispheric glioma	EFO_0005543	MONDO_0014695	NCIT_C185471
-Infantile Fibrosarcoma	MONDO_0002678	MONDO_0002678	NCIT_C4244
-Infantile hemispheric glioma	NA	NA	NA
 Inflammatory Myofibroblastic Tumor	MONDO_0015798	MONDO_0015798	NCIT_C6481
-Intrahepatic Cholangiocarcinoma	EFO_1001961	MONDO_0003210	NCIT_C35417
 Intraneural perineuroma	MONDO_0015032	MONDO_0015032	NCIT_C6911
 Juvenile xanthogranuloma	EFO_1000311	MONDO_0015534	NCIT_C3451
 Langerhans Cell histiocytosis	EFO_1000318	MONDO_0018310	NCIT_C3107
@@ -87,22 +84,22 @@ Mesenchymal tumor	EFO_1000473	MONDO_0003512	NCIT_C7059
 Mesothelioma	EFO_0000588	MONDO_0005065	NCIT_C3234
 Metastatic secondary tumors	EFO_0009812	MONDO_0024883	NCIT_C4968
 Mixed germ cell tumor	MONDO_0015864	MONDO_0015864	NCIT_C4290
+Myeloid Leukemia Associated with Down Syndrome	NA	NA	NA
+Myeloid Sarcoma	NA	NA	NA
 Neuroblastoma	EFO_0000621	MONDO_0005072	NCIT_C3270
-Neuroepithelial tumor with PATZ1 fusion	NA	NA	NA
 Neurofibroma/Plexiform	EFO_0000658	MONDO_0003304	NCIT_C3797
 Non-Hodgkin Lymphoma	EFO_0005952	MONDO_0018908	NCIT_C3211
-Non-germinomatous germ cell tumor	MONDO_0020580	MONDO_0020580	NCIT_C121619
 Oligodendroglioma	EFO_0000632	MONDO_0016695	NCIT_C3288
 Osteosarcoma	EFO_0000637	MONDO_0009807	NCIT_C9145
+Other tumor	NA	NA	NA
 Ovarian Serous Cystadenocarcinoma	EFO_1000043	MONDO_0006046	NCIT_C7978
 Pancreatic Adenocarcinoma	EFO_1000044	MONDO_0006047	NCIT_C8294
-Perineuroma	MONDO_0019404	MONDO_0019404	NCIT_C4973
+Pancreatoblastoma	NA	NA	NA
+Papillary Carcinoma	NA	NA	NA
 Pheochromocytoma and Paraganglioma	EFO_0020005	MONDO_0035540	NA
 Pilocytic astrocytoma	Orphanet_251612	MONDO_0016691	NCIT_C4047
 Pineoblastoma	EFO_1000475	MONDO_0016722	NCIT_C9344
-Pineocytoma	EFO_1000476	MONDO_0016723	NCIT_C6966
 Pleomorphic xanthoastrocytoma	MONDO_0016690	MONDO_0016690	NCIT_C4323
-Primary intracranial sarcoma	NA	NA	NA
 Primary mediastinal large B cell lymphoma	MONDO_0004021	MONDO_0020323	NCIT_C9280
 Prostate Adenocarcinoma	EFO_0000673	MONDO_0005082	NCIT_C2919
 Rectum Adenocarcinoma	EFO_0005631	MONDO_0002169	NCIT_C9383
@@ -115,15 +112,18 @@ Rosai-Dorfman disease	MONDO_0006412	MONDO_0006412	NCIT_C36075
 Rosette-forming glioneuronal tumor	MONDO_0016736	MONDO_0016736	NCIT_C129431
 Sarcoma	EFO_0000691	MONDO_0005089	NCIT_C9118
 Schwannoma	EFO_0000693	MONDO_0002546	NCIT_C3269
+Small Cell Carcinoma	NA	NA	NA
+Spindle cell neoplasm	NA	NA	NA
 Stomach Adenocarcinoma	EFO_0000503	MONDO_0005036	NCIT_C4004
 Subependymal Giant Cell Astrocytoma	MONDO_0016693	MONDO_0016693	NCIT_C3696
 T Acute Lymphoblastic Leukemia/Lymphoma	EFO_0000209	MONDO_0004963	NCIT_C3183
 Teratoma	MONDO_0002601	MONDO_0002601	NCIT_C3403
 Testicular Germ Cell Tumor	EFO_1000566	MONDO_0010108	NCIT_C8591
 Thymoma	EFO_1000581	MONDO_0006456	NCIT_C3411
 Thyroid Carcinoma	EFO_0002892	MONDO_0015075	NCIT_C4815
-Thyroid Gland Follicular Carcinoma	EFO_0000501	MONDO_0005034	NCIT_C8054
 Thyroid Gland Papillary Carcinoma	EFO_0000641	MONDO_0005075	NCIT_C4035
+Thyroid gland neoplasm	NA	NA	NA
+Type I Pleuropulmonary Blastoma	NA	NA	NA
 Uterine Carcinosarcoma	EFO_1000613	MONDO_0006485	NCIT_C42700
 Uterine Corpus Endometrial Carcinoma	EFO_0007532	MONDO_0000553	NCIT_C159413
 Uveal Melanoma	EFO_1000616	MONDO_0006486	NCIT_C7712

diff --git a/analyses/fusion-summary/01-fusion-summary.Rmd b/analyses/fusion-summary/01-fusion-summary.Rmd
@@ -62,10 +62,10 @@ prepareOutput <- function(fuseDF, bioid) {
   fuseDF %>% 
     # some fusions have in-frame and frameshift fusion calls for a sample
     # this will make unique fusionName and Sample dataset to get 1/0 values
-    dplyr::select(Sample,FusionName) %>%
-    unique() %>%
-    reshape2::dcast(Sample ~ FusionName,fun.aggregate = length) %>%
-    right_join(data.frame(Sample = bioid)) %>%
+    distinct(Sample, FusionName) %>%
+    mutate(Count = 1) %>%
+    pivot_wider(names_from = FusionName, values_from = Count, values_fill = list(Count = 0)) %>%
+    right_join(data.frame(Sample = specimensUnion)) %>%
     replace(is.na(.), 0) %>%
     rename(Kids_First_Biospecimen_ID = Sample)
 }