various updates including GSEA

noriakis · Jan 23, 2024 · 1032b2f · 1032b2f
1 parent dbc272f
commit 1032b2f
Show file tree

Hide file tree

Showing 15 changed files with 457 additions and 12 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -8,5 +8,5 @@ Authors@R: person("Noriaki", "Sato", email = "[email protected]", role = c("cre", "aut
 Depends: ggplot2, ggstar, ggraph, igraph
 Imports: GetoptLong, BiocFileCache, RCurl, igraph, vegan, methods, data.table, phangorn, RColorBrewer, ggtree, circlize, ComplexHeatmap, ggkegg, ape, dplyr, exactRankTests, ggblend, ggh4x, scales, tidygraph, ggplotify, ggtreeExtra, ggnewscale, scico
 Suggests: simplifyEnrichment, stringr, tidyr, Boruta, knitr, ggrepel
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.0
 VignetteBuilder: knitr
diff --git a/NAMESPACE b/NAMESPACE
@@ -3,6 +3,7 @@
 export(alleleStat)
 export(anno_PATRIC_keywords)
 export(anno_eggNOG_keywords)
+export(calcKO)
 export(changeColors)
 export(checkEGGNOG)
 export(checkPATRIC)
@@ -18,6 +19,7 @@ export(consensusSeqMIDAS2)
 export(consensusSeqMIDAS2Slow)
 export(doAdonis)
 export(doBoruta)
+export(doGSEA)
 export(doL0)
 export(drawEGGNOG)
 export(drawPATRIC)
@@ -32,7 +34,9 @@ export(getTree)
 export(loadInStrain)
 export(loadMIDAS)
 export(loadMIDAS2)
+export(loadMIDAS2Legacy)
 export(loadmetaSNV)
+export(obtain_positions)
 export(plotCirclize)
 export(plotCoverage)
 export(plotGenes)
@@ -42,6 +46,7 @@ export(plotMAF)
 export(plotPCA)
 export(plotTree)
 export(returnGenes)
+export(reverse_annot)
 export(setAnnotation)
 export(setGroup)
 export(setMetadata)

diff --git a/R/checkEGGNOG.R b/R/checkEGGNOG.R
@@ -206,19 +206,21 @@ checkEGGNOG <- function(annot_file, ret="all", checkIDs=NULL, fill=TRUE) {
 #' @param sp candidate species
 #' @param anno annotation tibble obtained by checkEGGNOG
 #' @param how summarising function, default to mean
+#' @param verbose output messages
 #' @return data.frame
 #' @export
 #' 
-summariseAbundance <- function(stana, sp, anno, how=mean) {
+summariseAbundance <- function(stana, sp, anno, how=mean, verbose=FALSE) {
   geneDf <- stana@genes[[sp]]
-  merged <- list()
   annoval <- anno$value |> unique()
-  for (i in annoval) {
+  merged <-  lapply(annoval, function(i) {
     candID <- (anno |> dplyr::filter(anno$value==i))$ID
     ints <- intersect(row.names(geneDf), candID)
     if (length(ints)>0) {
-      merged[[i]] <- apply(geneDf[ints,], 2, how)
+      return(apply(geneDf[ints,], 2, how))
+    } else {
+      return(NULL)
     }
-  }
+  })
   do.call(rbind, merged)
 }
diff --git a/R/consensusSeqFast.R b/R/consensusSeqFast.R
@@ -44,7 +44,7 @@ consensusSeqMIDAS2 <- function(
     rand_samples=NULL,
     return_mat=FALSE,
     verbose=FALSE) {
-    ## site-list is currently not supported.
+
     if (is.null(species)) {species <- stana@clearSnps}
     if (length(species)==0) {stop("No species available")}
 	files <- c("depth","info")

diff --git a/R/doGSEA.R b/R/doGSEA.R
@@ -0,0 +1,68 @@
+#' doGSEA
+#' @export
+doGSEA <- function(stana, candSp=NULL, cl=NULL, eps=1e-2, how=mean) {
+    if (is.null(candSp)) {candSp <- stana@ids[1]}
+    if (is.null(cl)) {cl <- stana@cl}
+    if (length(cl)!=2) {stop("Only the two group is supported")}
+
+    aa <- cl[[1]]
+    bb <- cl[[2]]
+
+    geneMat <- stana@genes[[candSp]]
+    inSample <- colnames(geneMat)
+    if (length(intersect(inSample, aa))==0) {stop("No sample available")}
+    if (length(intersect(inSample, bb))==0) {stop("No sample available")}
+
+    if (is.null(stana@kos[[candSp]])) {
+        cat("KO abundance not found, calculating based on annotation ...\n")
+        ko_df_filt <- summariseAbundance(stana, sp = candSp,
+            checkEGGNOG(annot_file=stana@eggNOG[[candSp]], "KEGG_ko"),
+            how=how)
+    } else {
+        ko_df_filt <- stana@kos[[candSp]]
+    }
+
+    ## eps values
+    ko_df_filt <- ko_df_filt + 1e-2
+    ko_sum <- log2(apply(ko_df_filt[,intersect(inSample, aa)],1,mean) /
+                   apply(ko_df_filt[,intersect(inSample, bb)],1,mean))
+    ## Perform GSEA (it will take time)?
+    ko_sum <- ko_sum[order(ko_sum, decreasing=TRUE)]
+
+    ## KO to PATHWAY mapping
+    url <- "https://rest.kegg.jp/link/ko/pathway"
+    kopgsea <- data.frame(data.table::fread(url, header = FALSE, sep = "\t"))
+    kopgsea <- kopgsea |> filter(startsWith(V1, "path:ko"))
+    kopgsea$V1 <- kopgsea$V1 |> strsplit(":") |>
+      vapply("[",2,FUN.VALUE="a")
+    enr <- clusterProfiler::GSEA(ko_sum,
+        TERM2GENE = kopgsea, pvalueCutoff=1)
+    return(enr)
+}
+
+#' calcKO
+#' @export
+calcKO <- function(stana, candSp=NULL, how=mean) {
+    if (is.null(candSp)) {candSp <- stana@ids[1]}
+    ko_df_filt <- summariseAbundance(stana, sp = candSp,
+        checkEGGNOG(annot_file=stana@eggNOG[[candSp]], "KEGG_ko"),
+        how=how)
+    stana@kos[[candSp]] <- ko_df_filt
+    stana
+}
+
+#' reverse_annot
+#' @export
+reverse_annot <- function(stana, candSp, candidate, col="KEGG_ko") {
+    annot <- checkEGGNOG(annot_file=stana@eggNOG[[candSp]], col)
+    annot %>% filter(.data[["value"]] %in% candidate) %>%
+        pull(ID) %>% unique()
+}
+
+
+#' obtain_positions
+#' @export
+obtain_positions <- function(stana, candSp, geneID) {
+    tmp <- stana@snpsInfo[[candSp]]
+    tmp[tmp$gene_id %in% geneID, ] %>% row.names()
+}
diff --git a/R/loadMIDAS2.R b/R/loadMIDAS2.R
@@ -1,6 +1,6 @@
-#' loadMIDAS2
+#' loadMIDAS2Legacy
 #' 
-#' load the MIDAS2 merge command output.
+#' load the MIDAS2 merge command output (outdated).
 #' The location to lz4 binary must be added to PATH.
 #' Taxonomy table can be loaded from downloaded MIDAS2 db directory (metadata.tsv).
 #' If provided, additionally show tax names.
@@ -21,7 +21,7 @@
 #' @param only_stat only samples per species is returned (snpStat and geneStat)
 #' @export
 #' 
-loadMIDAS2 <- function(midas_merge_dir,
+loadMIDAS2Legacy <- function(midas_merge_dir,
                         cl=NULL,
                         filtNum=2,
                         db="gtdb",