diff --git a/DESCRIPTION b/DESCRIPTION index d00d34c..4ba80a1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -27,7 +27,8 @@ Imports: RColorBrewer, Rhtslib, survival, - DNAcopy + DNAcopy, + pheatmap Suggests: berryFunctions, Biostrings, @@ -43,7 +44,6 @@ Suggests: RaggedExperiment, rmarkdown, S4Vectors, - pheatmap, curl LinkingTo: Rhtslib, diff --git a/NAMESPACE b/NAMESPACE index 6824b16..7f04222 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -62,6 +62,7 @@ export(rainfallPlot) export(read.maf) export(readGistic) export(sampleSwaps) +export(segSummarize) export(segmentLogR) export(setdiffMAF) export(signatureEnrichment) diff --git a/R/segSummarize.R b/R/segSummarize.R new file mode 100644 index 0000000..3a2262c --- /dev/null +++ b/R/segSummarize.R @@ -0,0 +1,177 @@ +#' Summarize CBS segmentation results +#' @details +#' A handy function to summarize CBS segmentation results. Takes segmentation results generated by DNAcopy package \code{\link{segment}} and summarizes the CN for each cytoband and chromosomal arms. +#' +#' @param seg segmentation results generated from \code{DNAcopy} package \code{\link{segment}}. Input should be a multi-sample segmentation file or a data.frame. First six columns should correspond to sample name, chromosome, start, end, Num_Probes, Segment_Mean in log2 scale. (default output format from DNAcopy) +#' @param build genome build. Default hg19. Can be hg19, hg38. If other than these, use `cytoband` argument +#' @param cytoband cytoband data from UCSC genome browser. Only needed if `build` is other than `hg19` or `hg38` +#' @param thr threshold to call amplification and deletion. Any cytobands or chromosomal arms with median logR above or below this will be called. Default 0.3 +#' @param verbose Default TRUE +#' @param maf optional MAF +#' @param genes Add mutation status of these genes as an annotation to the heatmap +#' @param topanno annotation for each sample. This is passed as an input to `annotation_col` of `pheatmap` +#' @param topannocols annotation cols for `topanno`. This is passed as an input to `annotation_colors` of `pheatmap` +#' @return List of median CN values for each cytoband and chromosomal arm along with the plotting matrix +#' @export +#' @examples +#' laml.seg <- system.file("extdata", "LAML_CBS_segments.tsv.gz", package = "maftools") +#' segSummarize(seg = laml.seg) +#' +#' #Heighlight some genes as annotation +#' laml.maf = system.file("extdata", "tcga_laml.maf.gz", package = "maftools") #MAF file +#' laml.clin = system.file('extdata', 'tcga_laml_annot.tsv', package = 'maftools') #clinical data +#' laml = read.maf(maf = laml.maf, clinicalData = laml.clin) +#' +#' segSummarize(seg = laml.seg, maf = laml, genes = c("FLT3", "DNMT3A")) + +segSummarize = function(seg = NULL, build = "hg19", cytoband = NULL, thr = 0.3, verbose = TRUE, maf = NULL, genes = NULL, topanno = NULL, topannocols = NA){ + + if(is.null(cytoband)){ + build = match.arg(arg = build, choices = c("hg19", "hg38")) + cytoband <- system.file("extdata", paste0(build, "_cytobands.tsv.gz"), package = "maftools") + if(!file.exists(cytoband)){ + stop("Cytoband file does not exist!") + } + } + + if (is(object = seg, class2 = "data.frame")) { + seg = data.table::as.data.table(seg) + }else { + seg = data.table::fread(input = seg) + } + colnames(seg)[1:6] = c("Sample", "Chromosome", "Start", "End", "Num_Probes", "Segment_Mean") + seg$Chromosome = gsub(pattern = "^chr", replacement = "", x = seg$Chromosome) + data.table::setkey(x = seg, Chromosome, Start, End) + + if(nrow(seg[,.N,Sample]) < 3){ + stop("Not enough samples to proceed (N: ", nrow(seg[,.N,Sample]), ")") + } + + cy = data.table::fread(input = cytoband) + colnames(cy) = c("Chromosome", "Start", "End", "name", "stain") + cy$arm = substr(cy$name, 1, 1) + cy$Chromosome = gsub(pattern = "^chr", replacement = "", x = cy$Chromosome) + data.table::setkey(x = cy, Chromosome, Start, End) + + seg_events = lapply(split(seg, seg$Sample), function(x){ + .seg2arm(se = x, cy = cy, thr = thr) + }) + + cy[, id := paste(Chromosome, name, sep = "_")] + cy[, arm := paste(Chromosome, substr(name, 1, 1), sep = "_")] + + #Summarize CN per chromosomal cytoband + cytoband_events = lapply(seg_events, function(x) x$seg) |> data.table::rbindlist(idcol = "Tumor_Sample_Barcode") + cytoband_events = data.table::dcast(data = cytoband_events, formula = chromosome+arm ~ Tumor_Sample_Barcode, value.var = "cn") + + cytoband_events[, id := paste(chromosome, arm, sep = "_")] + cytoband_events$chromosome = NULL + cytoband_events$arm = NULL + data.table::setDF(x = cytoband_events, cytoband_events$id) + cytoband_events$id = NULL + cytoband_events = cytoband_events[cy[id %in% rownames(cytoband_events), id],,] + cytoband_events[cytoband_events > 4] = 4 + cytoband_events = cytoband_events[complete.cases(cytoband_events),] + + ##row anno + cn_band_anno = as.data.frame(cy[id %in% rownames(cytoband_events)]) + cn_band_anno = data.frame(row.names = cn_band_anno$id, chr = cn_band_anno$Chromosome, arm = substr(x = cn_band_anno$name, 1, 1)) + + #order by chromosome for human build + chr_ord = intersect(c(1:22, "X", "Y"), names(split(cn_band_anno, cn_band_anno$chr))) + cn_band_anno = cn_band_anno[unlist(lapply(split(cn_band_anno, cn_band_anno$chr)[chr_ord], rownames), use.names = FALSE), , drop = FALSE] + cytoband_events = cytoband_events[rownames(cn_band_anno),, drop = FALSE] + + ##Heatmap colors + cols_chromosome = setNames(object = rep(c("#95a5a6", "#7f8c8d"), length(unique(cn_band_anno$chr))), nm = unique(cn_band_anno$chr)) + cols_chromosome = cols_chromosome[unique(cn_band_anno$chr)] + cols_chromosome_arms = setNames(object = c("#ecf0f1", "#bdc3c7"), nm = c("p", "q")) + + gene2barcode = gene2barcode_cols = NA + show_annotation_legend = FALSE + + if(!is.null(maf)){ + if(!is.null(genes)){ + gene2barcode = genesToBarcodes(maf = maf, genes = genes, justNames = TRUE) + print(gene2barcode) + gene2barcode = lapply(gene2barcode, function(x) data.table::data.table(Tumor_Sample_Barcode = x, status = 'yes')) |> data.table::rbindlist(idcol = "Hugo_Symbol") |> data.table::dcast(formula = Tumor_Sample_Barcode ~ Hugo_Symbol, value.var = "status") + data.table::setDF(x = gene2barcode, rownames = gene2barcode$Tumor_Sample_Barcode) + gene2barcode = gene2barcode[, 2:ncol(gene2barcode), drop = FALSE] + gene2barcode_cols = lapply(X = colnames(gene2barcode), function(x) setNames(object = "#34495e", nm = "yes")) + names(gene2barcode_cols) = colnames(gene2barcode) + } + } + + if(!is.null(topanno)){ + if(!is.null(nrow(gene2barcode))){ + gene2barcode = merge(gene2barcode, topanno, by = "row.names", all = TRUE) + rownames(gene2barcode) = gene2barcode$`Row.names` + gene2barcode$`Row.names` = NULL + }else{ + gene2barcode = topanno + } + show_annotation_legend = TRUE + } + + pheatmap::pheatmap( + cytoband_events, + cluster_rows = FALSE, + cluster_cols = TRUE, + show_colnames = FALSE, + annotation_row = cn_band_anno, + color = grDevices::colorRampPalette(rev(brewer.pal( + n = 7, name = "PiYG" + )))(100), + annotation_colors = c(list( + chr = cols_chromosome, + arm = cols_chromosome_arms + ), gene2barcode_cols, topannocols), + show_rownames = FALSE, + border_color = 'black', annotation_legend = show_annotation_legend, legend_breaks = seq(0, 4, 1), legend_labels = c("0", "1", "2", "3", ">4"), annotation_col = gene2barcode, na_col = "gray" + ) + + #Arm events + arm_events = lapply(seg_events, function(x) x$arm) |> data.table::rbindlist(fill = TRUE, use.names = TRUE, idcol = "Tumor_Sample_Barcode") + arm_events = arm_events[!is.na(Variant_Classification)][order(chromosome)] + arm_events_n = arm_events[,.N,.(arm,Variant_Classification)][!is.na(Variant_Classification)][order(-N)] + + if(verbose){ + message("Recurrent chromosomal arm aberrations") + print(arm_events_n) + } + + #Focal band events + band_events = lapply(seg_events, function(x) x$seg) |> data.table::rbindlist(fill = TRUE, use.names = TRUE, idcol = "Tumor_Sample_Barcode") + band_events = band_events[!is.na(Variant_Classification)][order(chromosome)] + colnames(band_events)[3] = "cytoband" + band_events_n = band_events[,.N,.(chromosome, cytoband,Variant_Classification)][!is.na(Variant_Classification)][order(-N)] + + attr(cytoband_events, "meta") = list(top_anno = gene2barcode, top_anno_cols = gene2barcode_cols) + + list(heatmap_matrix = cytoband_events, arm_events = arm_events, band_events = band_events) +} + +.seg2arm = function (se, cy = NULL, thr = 0.3) { + + data.table::setkey(x = se, Chromosome, Start, End) + se_olaps = data.table::foverlaps(x = cy, y = se) + cytoband_mean = se_olaps[, median(Segment_Mean, na.rm = TRUE), + .(Chromosome, name)] + colnames(cytoband_mean) = c("chromosome", "arm", "logR") + cytoband_mean[, `:=`(cn, 2 * (2^logR))] + cytoband_mean$Variant_Classification = ifelse(cytoband_mean$logR > + thr, "Amp", no = ifelse(cytoband_mean$logR < -thr, yes = "Del", + no = NA)) + cytoband_mean = cytoband_mean[!chromosome %in% c("chrX", + "chrY")] + arm_mean = se_olaps[, median(Segment_Mean, na.rm = TRUE), + .(Chromosome, arm)] + colnames(arm_mean) = c("chromosome", "arm", "logR") + arm_mean[, `:=`(cn, 2 * (2^logR))] + arm_mean$Variant_Classification = ifelse(arm_mean$logR > + thr, "Gain", no = ifelse(arm_mean$logR < -thr, yes = "Loss", + no = NA)) + arm_mean$arm = paste(arm_mean$chromosome, arm_mean$arm, sep = "_") + arm_mean = arm_mean[!chromosome %in% c("chrX", "chrY")] + list(arm = arm_mean, seg = cytoband_mean) +} diff --git a/inst/extdata/LAML_CBS_segments.tsv.gz b/inst/extdata/LAML_CBS_segments.tsv.gz new file mode 100644 index 0000000..c206ed4 Binary files /dev/null and b/inst/extdata/LAML_CBS_segments.tsv.gz differ diff --git a/inst/extdata/hg19_cytobands.tsv.gz b/inst/extdata/hg19_cytobands.tsv.gz new file mode 100644 index 0000000..d431db0 Binary files /dev/null and b/inst/extdata/hg19_cytobands.tsv.gz differ diff --git a/inst/extdata/hg38_cytobands.tsv.gz b/inst/extdata/hg38_cytobands.tsv.gz new file mode 100644 index 0000000..1a6ab25 Binary files /dev/null and b/inst/extdata/hg38_cytobands.tsv.gz differ diff --git a/man/segSummarize.Rd b/man/segSummarize.Rd new file mode 100644 index 0000000..7a93c52 --- /dev/null +++ b/man/segSummarize.Rd @@ -0,0 +1,57 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/segSummarize.R +\name{segSummarize} +\alias{segSummarize} +\title{Summarize CBS segmentation results} +\usage{ +segSummarize( + seg = NULL, + build = "hg19", + cytoband = NULL, + thr = 0.3, + verbose = TRUE, + maf = NULL, + genes = NULL, + topanno = NULL, + topannocols = NA +) +} +\arguments{ +\item{seg}{segmentation results generated from \code{DNAcopy} package \code{\link{segment}}. Input should be a multi-sample segmentation file or a data.frame. First six columns should correspond to sample name, chromosome, start, end, Num_Probes, Segment_Mean in log2 scale. (default output format from DNAcopy)} + +\item{build}{genome build. Default hg19. Can be hg19, hg38. If other than these, use `cytoband` argument} + +\item{cytoband}{cytoband data from UCSC genome browser. Only needed if `build` is other than `hg19` or `hg38`} + +\item{thr}{threshold to call amplification and deletion. Any cytobands or chromosomal arms with median logR above or below this will be called. Default 0.3} + +\item{verbose}{Default TRUE} + +\item{maf}{optional MAF} + +\item{genes}{Add mutation status of these genes as an annotation to the heatmap} + +\item{topanno}{annotation for each sample. This is passed as an input to `annotation_col` of `pheatmap`} + +\item{topannocols}{annotation cols for `topanno`. This is passed as an input to `annotation_colors` of `pheatmap`} +} +\value{ +List of median CN values for each cytoband and chromosomal arm along with the plotting matrix +} +\description{ +Summarize CBS segmentation results +} +\details{ +A handy function to summarize CBS segmentation results. Takes segmentation results generated by DNAcopy package \code{\link{segment}} and summarizes the CN for each cytoband and chromosomal arms. +} +\examples{ +laml.seg <- system.file("extdata", "LAML_CBS_segments.tsv.gz", package = "maftools") +segSummarize(seg = laml.seg) + +#Heighlight some genes as annotation +laml.maf = system.file("extdata", "tcga_laml.maf.gz", package = "maftools") #MAF file +laml.clin = system.file('extdata', 'tcga_laml_annot.tsv', package = 'maftools') #clinical data +laml = read.maf(maf = laml.maf, clinicalData = laml.clin) + +segSummarize(seg = laml.seg, maf = laml, genes = c("FLT3", "DNMT3A")) +} diff --git a/vignettes/maftools.Rmd b/vignettes/maftools.Rmd index b59e48f..4f284f2 100644 --- a/vignettes/maftools.Rmd +++ b/vignettes/maftools.Rmd @@ -29,7 +29,7 @@ If you find this tool useful, please cite: ------------------------------------------------------------------------ -***Mayakonda A, Lin DC, Assenov Y, Plass C, Koeffler HP. 2018. Maftools: efficient and comprehensive analysis of somatic variants in cancer. [Genome Resarch. PMID: 30341162](https://doi.org/10.1101/gr.239244.118)*** +***Mayakonda A, Lin DC, Assenov Y, Plass C, Koeffler HP. 2018. Maftools: efficient and comprehensive analysis of somatic variants in cancer. [Genome Research. PMID: 30341162](https://doi.org/10.1101/gr.239244.118)*** ------------------------------------------------------------------------ @@ -279,6 +279,19 @@ This is similar to oncoplots except for copy number data. One can again sort the gisticOncoPlot(gistic = laml.gistic, clinicalData = getClinicalData(x = laml), clinicalFeatures = 'FAB_classification', sortByAnnotation = TRUE, top = 10) ``` +## CBS segments + +### Summarizing chromosomal arm level CN + +A multi-sample CBS segments file can be summarized to get the CN status of each cytoband and chromosomal arms. Below plot shows 5q deletions in a subset of AML samples. +The function returns the plot data and the CN status of each chromosomal cytoband and arms for every sample. + +```{r segSummarize} +laml.seg <- system.file("extdata", "LAML_CBS_segments.tsv.gz", package = "maftools") +segSummarize_results = segSummarize(seg = laml.seg) +``` + + ### Visualizing CBS segments ```{r plotCBSsegments, fig.height=2.5,fig.width=4,fig.align='left'}