Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EGFR in DMG #617

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
38 changes: 38 additions & 0 deletions analyses/DMG_analysis/00-EGFR_maf_subset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
## subset maf file
library(tidyverse)

## set directories
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data", "v13")
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")

hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
selected_sample <- hist %>%
filter(grepl(c("DMG"), molecular_subtype))

## subset genes of interests
gene_of_interests <- c("H3-3A", "H3C2", "H3C3", "H3C14",
"EGFR", "TP53", "ATRX", "NF1")

tumor_only_maf <- data.table::fread(
file.path(data_dir, "snv-mutect2-tumor-only-plus-hotspots.maf.tsv.gz"),
data.table = FALSE) %>%
filter(Tumor_Sample_Barcode %in% selected_sample$Kids_First_Biospecimen_ID,
Hugo_Symbol %in% gene_of_interests)

# Add tumor only to consensus MAF
snv_consensus_hotspot_maf <- data.table::fread(
file.path(data_dir, "snv-consensus-plus-hotspots.maf.tsv.gz"),
data.table = FALSE) %>%
filter(Tumor_Sample_Barcode %in% selected_sample$Kids_First_Biospecimen_ID,
Hugo_Symbol %in% gene_of_interests)

tumor_only_maf$PUBMED <- as.character(tumor_only_maf$PUBMED)
tumor_only_maf$PHENO <- as.character(tumor_only_maf$PHENO)
tumor_only_maf$gnomad_3_1_1_AF_non_cancer <- as.character(tumor_only_maf$gnomad_3_1_1_AF_non_cancer)

snv_final <- snv_consensus_hotspot_maf %>%
dplyr::bind_rows(tumor_only_maf)

write_tsv(snv_final, file.path(subset_dir, "snv_selected_genes.maf.tsv"))
12 changes: 12 additions & 0 deletions analyses/DMG_analysis/00-onco_annotate.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash

set -e
set -o pipefail

python_path=(/usr/bin/python3)
maf_annotator=(/Users/gengz/Documents/GitHub/oncokb-annotator/MafAnnotator.py)
maf_in=(/Users/gengz/Documents/GitHub/OpenPedCan-analysis/analyses/DMG_analysis/subset/snv_selected_genes.maf.tsv)
maf_oncokb_out=(/Users/gengz/Documents/GitHub/OpenPedCan-analysis/analyses/DMG_analysis/subset/snv_selected_genes_oncokb.maf.tsv)

# Run maf_annotator on dgd samples
$python_path $maf_annotator -i $maf_in -o $maf_oncokb_out -b $ONCO_KB -q hgvsp_short -r GRCh38
74 changes: 74 additions & 0 deletions analyses/DMG_analysis/01-data preparation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
library(tidyverse)
library(reshape2)

## set directories
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data", "v13")
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")


## read histologies
hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
selected_sample <- hist %>%
filter(grepl(c("DMG"), molecular_subtype))

##snv file
snv_annotate <- read_tsv(file.path(subset_dir, "snv_selected_genes_oncokb.maf.tsv"))
snv_annotate_short <- snv_annotate %>%
mutate(MUTATION_EFFECT = case_when(Hugo_Symbol == "EGFR" &
HGVSp_Short %in% c("p.A289V", "p.A289T") ~
paste(MUTATION_EFFECT, "WHO mutation", sep = ";"),
TRUE ~ MUTATION_EFFECT)) %>%
select(Tumor_Sample_Barcode, MUTATION_EFFECT, Hugo_Symbol)

## cn files
cn_df <- read_tsv(file.path(data_dir, "consensus_wgs_plus_cnvkit_wxs_plus_freec_tumor_only.tsv.gz"))
cn_filtered <- cn_df %>%
filter(biospecimen_id %in% selected_sample$Kids_First_Biospecimen_ID,
gene_symbol == "EGFR") %>%
select(biospecimen_id, status, gene_symbol) %>%
dplyr::rename("Tumor_Sample_Barcode" = "biospecimen_id",
"MUTATION_EFFECT" = "status",
"Hugo_Symbol" = "gene_symbol") %>%
mutate(MUTATION_EFFECT = case_when(MUTATION_EFFECT == "amplification" ~ "amplification",
TRUE ~ ""))
rm(cn_df)

## combined cnv with snv
snv_onco <- cn_filtered %>%
bind_rows(snv_annotate_short) %>%
left_join(hist %>% select(Kids_First_Biospecimen_ID, match_id),
by = c("Tumor_Sample_Barcode" = "Kids_First_Biospecimen_ID")) %>%
select(match_id, MUTATION_EFFECT, Hugo_Symbol) %>%
unique() %>%
group_by(match_id, Hugo_Symbol) %>%
summarise(status = paste0(MUTATION_EFFECT, collapse = ";")) %>%
spread(Hugo_Symbol, status)

## RNA files
RNA <- read_rds(file.path(data_dir, "gene-expression-rsem-tpm-collapsed.rds"))
col_rna <- selected_sample %>% filter(experimental_strategy == "RNA-Seq") %>% select(Kids_First_Biospecimen_ID, match_id)

select_RNA <- RNA[rownames(RNA) %in% c("EZHIP", "EGFR"), col_rna %>% pull(Kids_First_Biospecimen_ID)]
select_RNA <- as.data.frame(scale(t(select_RNA)))
select_RNA$Kids_First_Biospecimen_ID <- rownames(select_RNA)

select_RNA <- select_RNA %>%
mutate(EGFR_exp = case_when(EGFR >=3 ~ "overexpression",
TRUE ~ NA),
EZHIP_exp = case_when(EZHIP >=3 ~ "overexpression",
TRUE ~ NA)) %>%
left_join(selected_sample %>% select(Kids_First_Biospecimen_ID, match_id)) %>%
select(match_id, EGFR_exp, EZHIP_exp) %>%
unique()

## gather all datas into one table
final_df <- selected_sample %>%
select(match_id, age_at_diagnosis_days, reported_gender, molecular_subtype, tumor_descriptor) %>%
unique() %>%
left_join(snv_onco) %>%
left_join(select_RNA) %>%
mutate(reported_gender = case_when(reported_gender == "Not Reported" ~ "Unknown", TRUE ~ reported_gender)) %>%
write_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))

89 changes: 89 additions & 0 deletions analyses/DMG_analysis/02-oncoplot.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
---
title: "oncoplot_DMG"
author: "ZZ"
date: "2024-01-16"
output: pdf_document
---

## load libraries

```{r libraries}
library(tidyverse)
library(ComplexHeatmap)
library(reshape2)
library(circlize)
```


## set directories and read file
```{r}
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")

onco_df <- read_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))
```

oncoplot

```{r}

colname_annotate <- c("match_id", "reported_gender", "tumor_descriptor", "molecular_subtype","EGFR_exp", "EZHIP_exp")

annotate_info <- as.data.frame(onco_df[,colname_annotate])
rownames(annotate_info) <- annotate_info$match_id
annotate_info <- annotate_info[,-1]

ha = HeatmapAnnotation(df = annotate_info, col = list(reported_gender = c("Male" = "#0707CF",
"Female" = "#CC0303",
"Unknown" = "white"),
molecular_subtype = c("DMG, H3 K28" = "red",
"DMG, H3 K28, TP53" = "green")))

mat <- as.matrix(onco_df[,-c(2,3,4,5,14,15)])
mat[is.na(mat)] = ""
rownames(mat) = mat[, 1]
mat = mat[, -1]
mat = t(as.matrix(mat))

alter_fun = list(
background = function(x, y, w, h) {
grid.rect(x, y, w, h, gp = gpar(fill = "#ffffff",col= "#ffffff"))
},
`Likely Gain-of-function` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#ff4d4d", col = NA))
},
`Likely Loss-of-function` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#0D47A1", col = NA))
},
Unknown = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "grey", col = NA))
},
`Loss-of-function` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#80bfff", col = NA))
},
Inconclusive = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#8D6E63", col = NA))
},
`Gain-of-function` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#9966ff", col = NA))
},
`WHO mutation` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#E69F00", col = NA))
},
`amplification` = function(x, y, w, h) {
grid.rect(x, y, w-unit(0.3, "mm"), h-unit(0.3, "mm"), gp = gpar(fill = "#827717", col = NA))
}
)

oncoplot <- oncoPrint(mat, get_type = function(x)strsplit(x, ";")[[1]],
alter_fun = alter_fun, top_annotation = ha)

pdf(file = "oncoplot.pdf", width = 16, height = 8)
draw(oncoplot)
dev.off()


```


78 changes: 78 additions & 0 deletions analyses/DMG_analysis/03-gsva.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
library(GSVA)

## set directories
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data", "v13")
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")


## read histologies
hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
selected_sample <- hist %>%
filter(grepl(c("DMG"), molecular_subtype))
onco_df <- read_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))

## RNA
RNA <- read_rds(file.path(data_dir, "gene-expression-rsem-tpm-collapsed.rds"))
col_rna <- selected_sample %>%
filter(experimental_strategy == "RNA-Seq") %>%
select(Kids_First_Biospecimen_ID, match_id)
select_RNA_all <- RNA[, col_rna %>% pull(Kids_First_Biospecimen_ID)]
log_rna <- as.matrix(log2(select_RNA_all + 1))

## Hallmark gene
human_hallmark <- msigdbr::msigdbr(species = "Homo sapiens",
category = "H")

human_hallmark_twocols <- human_hallmark %>%
select(gs_name, gene_symbol)

human_hallmark_list <- base::split(
human_hallmark_twocols$gene_symbol,
list(human_hallmark_twocols$gs_name))

## annotation
col_annotate <- col_rna %>%
left_join(onco_df %>% select(match_id, EGFR)) %>%
mutate(EGFR_mut = case_when(!is.na(EGFR) ~ "EGFR mutated", TRUE ~ "EGFR unmutated")) %>%
select(-match_id)

RNA_EGFR_un <- as.matrix(log_rna[, col_annotate %>% filter(EGFR_mut == "EGFR unmutated") %>% pull(Kids_First_Biospecimen_ID)])
RNA_EGFR <- as.matrix(log_rna[, col_annotate %>% filter(EGFR_mut == "EGFR mutated") %>% pull(Kids_First_Biospecimen_ID)])

## GSVA for all samples
Hallmark_gsea_score_all <- GSVA::gsva(expr = log_rna,
gset.idx.list = human_hallmark_list,
method = "gsva",
min.sz = 1, max.sz = 1500,
parallel.sz = 8,
mx.diff = TRUE)

colAnn <- ComplexHeatmap::HeatmapAnnotation(EGFR_mut = col_annotete$EGFR_mut, which = "col")
hm <- ComplexHeatmap::Heatmap(Hallmark_gsea_score_all,
show_column_names = FALSE, top_annotation = colAnn,
row_names_gp = grid::gpar(fontsize = 5.8),
name = "GSVA score")

pdf("GSVA.pdf", height = 10, width = 15)
ComplexHeatmap::draw(hm)
dev.off()

## GSVA on EGFR mutated and unmutated
Hallmark_gsea_score <- GSVA::gsva(expr = RNA_EGFR,
gset.idx.list = human_hallmark_list,
method = "gsva",
min.sz = 1, max.sz = 1500,
parallel.sz = 8,
mx.diff = TRUE)

Hallmark_gsea_score_un <- GSVA::gsva(expr = RNA_EGFR_un,
gset.idx.list = human_hallmark_list,
method = "gsva",
min.sz = 1, max.sz = 1500,
parallel.sz = 8,
mx.diff = TRUE)

ComplexHeatmap::Heatmap(Hallmark_gsea_score)
ComplexHeatmap::Heatmap(Hallmark_gsea_score_un)
99 changes: 99 additions & 0 deletions analyses/DMG_analysis/04-summary_table.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
---
title: "summary_DMG"
author: "ZZ"
date: "2024-01-15"
output: pdf_document
---

## libraries
```{r}
library(tidyverse)

```

## set directories
```{r}
root_dir <- rprojroot::find_root(rprojroot::has_dir(".git"))
data_dir <- file.path(root_dir, "data", "v13")
subset_dir <- file.path(root_dir, "analyses", "DMG_analysis", "subset")
result_dir <- file.path(root_dir, "analyses", "DMG_analysis", "results")

```

## load files

```{r}
hist <- read_tsv(file.path(data_dir, "histologies.tsv"))
selected_sample <- hist %>%
filter(grepl(c("DMG"), molecular_subtype))
onco_df <- read_tsv(file.path(result_dir, "df_for_oncoplot.tsv"))
sv <- data.table::fread(file.path(data_dir, "sv-manta.tsv.gz"))
tp53 <- read_tsv(file.path(root_dir, "analyses", "tp53_nf1_score", "results", "tp53_altered_status.tsv"))

```
### subset files
```{r}
tp53_select <- tp53 %>%
filter(match_id %in% selected_sample$match_id) %>%
select(match_id, tp53_score, tp53_altered) %>%
mutate(tp53_alteration = case_when(tp53_altered %in% c("activated", "loss") ~ "Y", TRUE ~ "N")) %>%
unique()

sv_selected <- sv %>%
filter(Kids.First.Biospecimen.ID.Tumor %in% selected_sample$Kids_First_Biospecimen_ID,
Gene_name == "EGFR") %>%
select(Gene_name, Kids.First.Biospecimen.ID.Tumor, SV_type) %>%
dplyr::rename("Kids_First_Biospecimen_ID" = "Kids.First.Biospecimen.ID.Tumor") %>%
left_join(selected_sample %>% select(Kids_First_Biospecimen_ID, match_id)) %>%
mutate(EGFR_ins = case_when(SV_type == "INS" ~ "Y",
TRUE ~ "N")) %>%
select(match_id, EGFR_ins, SV_type) %>%
unique()

rm(sv)
```

### transform onco file

```{r}

snv_annotate <- read_tsv(file.path(subset_dir, "snv_selected_genes_oncokb.maf.tsv"))
snv_annotate_1 <- snv_annotate %>%
left_join(selected_sample %>% select(match_id, Kids_First_Biospecimen_ID),
by = c("Tumor_Sample_Barcode" = "Kids_First_Biospecimen_ID")) %>%
mutate(H3.1_K28M = case_when(Hugo_Symbol == "H3-3A" & HGVSp_Short == "p.K28M" ~ "Y", TRUE ~ "N"),
H3.3_K28M = case_when(Hugo_Symbol %in% c("H3C2", "H3C3") & HGVSp_Short == "p.K28M" ~ "Y", TRUE ~ "N"),
EGFR_ins = case_when(grepl("Ins", Variant_Classification) & Hugo_Symbol == "EGFR" ~ "Y", TRUE ~ "N"),
EGFR_onco = case_when(Hugo_Symbol == "EGFR" & ONCOGENIC == "Oncogenic" ~ "Y", TRUE ~ "N"),
EGFR_WHO_reported = case_when(Hugo_Symbol == "EGFR" & HGVSp_Short %in% c("p.A289T", "p.A289V") ~ "Y", TRUE ~ "N"),
ATRX_onco = case_when(Hugo_Symbol == "ATRX" & ONCOGENIC == "Oncogenic" ~ "Y", TRUE ~ "N")) %>%
select(match_id, H3.1_K28M, H3.3_K28M, EGFR_ins, EGFR_onco, EGFR_WHO_reported, ATRX_onco) %>%
filter(rowSums(is.na(.)) != 6) %>%
group_by(match_id) %>%
summarize_at(vars(H3.1_K28M, H3.3_K28M, EGFR_ins, EGFR_onco, EGFR_WHO_reported, ATRX_onco),
funs(sum = paste(unique(.), collapse = ";"))) %>%
unique()



```

```{r}

summary_df <- onco_df %>%
mutate(H3_WT = case_when(is.na(`H3-3A`) & is.na(H3C14) & is.na(H3C2) & is.na(H3C3) ~ "Y", TRUE ~ "N"),
EGFR_overexp = case_when(EGFR_exp == "overexpression" ~ "Y", TRUE ~ "N"),
EZHIP_overexp = case_when(EZHIP_exp == "overexpression" ~ "Y", TRUE ~ "N"),
EGFR_amp = case_when(grepl("amplification", EGFR) ~ "Y", TRUE ~ "N")) %>%
select(match_id, H3_WT, EGFR_overexp, EZHIP_overexp, EGFR_amp) %>%
left_join(tp53_select) %>%
left_join(snv_annotate_1) %>%
left_join(sv_selected) %>%
mutate(H3.1_K28M_sum = case_when(H3.1_K28M_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ H3.1_K28M_sum),
H3.3_K28M_sum = case_when(H3.3_K28M_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ H3.3_K28M_sum),
EGFR_ins_sum = case_when(EGFR_ins_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ EGFR_ins_sum),
EGFR_onco_sum = case_when(EGFR_onco_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ EGFR_onco_sum),
EGFR_WHO_reported_sum = case_when(EGFR_WHO_reported_sum %in% c("Y;N", "N;Y") ~ "Y", TRUE ~ EGFR_WHO_reported_sum))

write_tsv(summary_df, file.path(result_dir, "summary_table.tsv"))
```
Loading
Loading