diff --git a/DESCRIPTION b/DESCRIPTION index f8a0772..4309cb7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -8,5 +8,5 @@ Authors@R: person("Noriaki", "Sato", email = "nori@hgc.jp", role = c("cre", "aut Depends: ggplot2, ggstar, ggraph, igraph Imports: GetoptLong, BiocFileCache, RCurl, igraph, vegan, methods, data.table, phangorn, RColorBrewer, ggtree, circlize, ComplexHeatmap, ggkegg, ape, dplyr, exactRankTests, ggblend, ggh4x, scales, tidygraph, ggplotify, ggtreeExtra, ggnewscale, scico Suggests: simplifyEnrichment, stringr, tidyr, Boruta, knitr, ggrepel -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.0 VignetteBuilder: knitr diff --git a/NAMESPACE b/NAMESPACE index 2738531..f834807 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -3,6 +3,7 @@ export(alleleStat) export(anno_PATRIC_keywords) export(anno_eggNOG_keywords) +export(calcKO) export(changeColors) export(checkEGGNOG) export(checkPATRIC) @@ -18,6 +19,7 @@ export(consensusSeqMIDAS2) export(consensusSeqMIDAS2Slow) export(doAdonis) export(doBoruta) +export(doGSEA) export(doL0) export(drawEGGNOG) export(drawPATRIC) @@ -32,7 +34,9 @@ export(getTree) export(loadInStrain) export(loadMIDAS) export(loadMIDAS2) +export(loadMIDAS2Legacy) export(loadmetaSNV) +export(obtain_positions) export(plotCirclize) export(plotCoverage) export(plotGenes) @@ -42,6 +46,7 @@ export(plotMAF) export(plotPCA) export(plotTree) export(returnGenes) +export(reverse_annot) export(setAnnotation) export(setGroup) export(setMetadata) diff --git a/R/checkEGGNOG.R b/R/checkEGGNOG.R index 8e88801..47df030 100644 --- a/R/checkEGGNOG.R +++ b/R/checkEGGNOG.R @@ -206,19 +206,21 @@ checkEGGNOG <- function(annot_file, ret="all", checkIDs=NULL, fill=TRUE) { #' @param sp candidate species #' @param anno annotation tibble obtained by checkEGGNOG #' @param how summarising function, default to mean +#' @param verbose output messages #' @return data.frame #' @export #' -summariseAbundance <- function(stana, sp, anno, how=mean) { +summariseAbundance <- function(stana, sp, anno, how=mean, verbose=FALSE) { geneDf <- stana@genes[[sp]] - merged <- list() annoval <- anno$value |> unique() - for (i in annoval) { + merged <- lapply(annoval, function(i) { candID <- (anno |> dplyr::filter(anno$value==i))$ID ints <- intersect(row.names(geneDf), candID) if (length(ints)>0) { - merged[[i]] <- apply(geneDf[ints,], 2, how) + return(apply(geneDf[ints,], 2, how)) + } else { + return(NULL) } - } + }) do.call(rbind, merged) } \ No newline at end of file diff --git a/R/consensusSeqFast.R b/R/consensusSeqFast.R index 62efcb3..ab933bc 100644 --- a/R/consensusSeqFast.R +++ b/R/consensusSeqFast.R @@ -44,7 +44,7 @@ consensusSeqMIDAS2 <- function( rand_samples=NULL, return_mat=FALSE, verbose=FALSE) { - ## site-list is currently not supported. + if (is.null(species)) {species <- stana@clearSnps} if (length(species)==0) {stop("No species available")} files <- c("depth","info") diff --git a/R/doGSEA.R b/R/doGSEA.R new file mode 100644 index 0000000..89d1f4c --- /dev/null +++ b/R/doGSEA.R @@ -0,0 +1,68 @@ +#' doGSEA +#' @export +doGSEA <- function(stana, candSp=NULL, cl=NULL, eps=1e-2, how=mean) { + if (is.null(candSp)) {candSp <- stana@ids[1]} + if (is.null(cl)) {cl <- stana@cl} + if (length(cl)!=2) {stop("Only the two group is supported")} + + aa <- cl[[1]] + bb <- cl[[2]] + + geneMat <- stana@genes[[candSp]] + inSample <- colnames(geneMat) + if (length(intersect(inSample, aa))==0) {stop("No sample available")} + if (length(intersect(inSample, bb))==0) {stop("No sample available")} + + if (is.null(stana@kos[[candSp]])) { + cat("KO abundance not found, calculating based on annotation ...\n") + ko_df_filt <- summariseAbundance(stana, sp = candSp, + checkEGGNOG(annot_file=stana@eggNOG[[candSp]], "KEGG_ko"), + how=how) + } else { + ko_df_filt <- stana@kos[[candSp]] + } + + ## eps values + ko_df_filt <- ko_df_filt + 1e-2 + ko_sum <- log2(apply(ko_df_filt[,intersect(inSample, aa)],1,mean) / + apply(ko_df_filt[,intersect(inSample, bb)],1,mean)) + ## Perform GSEA (it will take time)? + ko_sum <- ko_sum[order(ko_sum, decreasing=TRUE)] + + ## KO to PATHWAY mapping + url <- "https://rest.kegg.jp/link/ko/pathway" + kopgsea <- data.frame(data.table::fread(url, header = FALSE, sep = "\t")) + kopgsea <- kopgsea |> filter(startsWith(V1, "path:ko")) + kopgsea$V1 <- kopgsea$V1 |> strsplit(":") |> + vapply("[",2,FUN.VALUE="a") + enr <- clusterProfiler::GSEA(ko_sum, + TERM2GENE = kopgsea, pvalueCutoff=1) + return(enr) +} + +#' calcKO +#' @export +calcKO <- function(stana, candSp=NULL, how=mean) { + if (is.null(candSp)) {candSp <- stana@ids[1]} + ko_df_filt <- summariseAbundance(stana, sp = candSp, + checkEGGNOG(annot_file=stana@eggNOG[[candSp]], "KEGG_ko"), + how=how) + stana@kos[[candSp]] <- ko_df_filt + stana +} + +#' reverse_annot +#' @export +reverse_annot <- function(stana, candSp, candidate, col="KEGG_ko") { + annot <- checkEGGNOG(annot_file=stana@eggNOG[[candSp]], col) + annot %>% filter(.data[["value"]] %in% candidate) %>% + pull(ID) %>% unique() +} + + +#' obtain_positions +#' @export +obtain_positions <- function(stana, candSp, geneID) { + tmp <- stana@snpsInfo[[candSp]] + tmp[tmp$gene_id %in% geneID, ] %>% row.names() +} \ No newline at end of file diff --git a/R/loadMIDAS2.R b/R/loadMIDAS2.R index cf06a51..1451c40 100644 --- a/R/loadMIDAS2.R +++ b/R/loadMIDAS2.R @@ -1,6 +1,6 @@ -#' loadMIDAS2 +#' loadMIDAS2Legacy #' -#' load the MIDAS2 merge command output. +#' load the MIDAS2 merge command output (outdated). #' The location to lz4 binary must be added to PATH. #' Taxonomy table can be loaded from downloaded MIDAS2 db directory (metadata.tsv). #' If provided, additionally show tax names. @@ -21,7 +21,7 @@ #' @param only_stat only samples per species is returned (snpStat and geneStat) #' @export #' -loadMIDAS2 <- function(midas_merge_dir, +loadMIDAS2Legacy <- function(midas_merge_dir, cl=NULL, filtNum=2, db="gtdb", diff --git a/R/loadMIDAS2Refine.R b/R/loadMIDAS2Refine.R new file mode 100644 index 0000000..e7973ac --- /dev/null +++ b/R/loadMIDAS2Refine.R @@ -0,0 +1,264 @@ +#' loadMIDAS2 +#' +#' load the MIDAS2 merge command output. +#' The location to lz4 binary must be added to PATH. +#' Taxonomy table can be loaded from downloaded MIDAS2 db directory (metadata.tsv). +#' If provided, additionally show tax names. +#' +#' @param midas_merge_dir path to merged directory +#' @param cl named list of category for samples +#' @param filtBy filter by {snps} number or {genes} number (default to snps) +#' @param filtNum the species with the samples above this number will be returned +#' @param filtPer filter by fraction +#' @param db data base that was used to profile, default to gtdb. +#' @param candSp candidate species ID +#' @param taxtbl tax table, row.names: 6-digits MIDAS2 ID and `GTDB species` (gtdb) +#' or `Lineage` column. +#' @param filtType "whole" or "group" +#' @param geneType "presabs" or "copynum" +#' @param loadSummary default to FALSE, load summary information. +#' @param loadInfo default to FALSE, load info information. +#' @param loadDepth default to FALSE, load depth information. +#' @param only_stat only samples per species is returned (snpStat and geneStat) +#' @export +#' +loadMIDAS2 <- function(midas_merge_dir, + cl=NULL, + filtNum=2, + db="gtdb", + only_stat=FALSE, + filtBy="snps", + filtPer=0.8, + taxtbl=NULL, + candSp=NULL, + filtType="group", + geneType="copynum", + loadSummary=TRUE, + loadInfo=TRUE, + loadDepth=TRUE) { + stana <- new("stana") + if (only_stat) loadSummary <- TRUE + stana@type <- "MIDAS2" + stana@db <- db + if (!is.null(cl)) { + stana@cl <- cl + } + + if (db=="gtdb") { + tblCol <- "GTDB species" + } else if (db=="uhgg") { + tblCol <- "Lineage" + } else { + stop("Please provide gtdb or uhgg to `db`") + } + + ## Load summary for filtering + ## Probably filter based on these summaries, but what if + ## These files are not available and only species directories are present? + filePath <- paste0(midas_merge_dir,"/snps/snps_summary.tsv") + snpsSummary <- read.table(filePath, header=1) + stana@snpsSummary <- snpsSummary + filePath <- paste0(midas_merge_dir,"/genes/genes_summary.tsv") + genesSummary <- read.table(filePath, header=1) + stana@genesSummary <- genesSummary + + if (is.null(cl)) { + cl <- list("no_group"=unique(c(genesSummary$sample_name, snpsSummary$sample_name))) + stana@cl <- cl + } + + grnm <- NULL + for (sn in snpsSummary$sample_name) { + tmpgr <- NULL + for (i in names(cl)) { + if (sn %in% cl[[i]]) { + tmpgr <- c(tmpgr, i) + } + } + if (length(tmpgr)>1) { + stop("Sample in multiple groups") + } else { + grnm <- c(grnm, tmpgr) + } + } + + snpsSummary$group <- grnm + + grnm <- NULL + for (sn in genesSummary$sample_name) { + tmpgr <- NULL + for (i in names(cl)) { + if (sn %in% cl[[i]]) { + tmpgr <- c(tmpgr, i) + } + } + if (length(tmpgr)>1) { + stop("Sample in multiple groups") + } else { + grnm <- c(grnm, tmpgr) + } + } + + genesSummary$group <- grnm + snpRet <- snpsSummary |> dplyr::group_by(species_id) |> + dplyr::count(group) + geneRet <- genesSummary |> dplyr::group_by(species_id) |> + dplyr::count(group) + + ## Change tax name if available + if (!is.null(taxtbl)) { + nmdic <- taxtbl[[tblCol]] |> setNames(row.names(taxtbl)) + snpRet$species_name <- nmdic[as.character(snpRet$species_id)] + geneRet$species_name <- nmdic[as.character(geneRet$species_id)] + } + + snpRet$species_id <- as.character(snpRet$species_id) + geneRet$species_id <- as.character(geneRet$species_id) + + if (only_stat) { + return( + list("snps"=snpRet, + "genes"=geneRet) + ) + } + + stana@mergeDir <- midas_merge_dir + stana@geneType <- geneType + stana@sampleFilter <- filtType + stana@sampleFilterVal <- filtNum + stana@sampleFilterPer <- filtPer + + clearGn <- NULL + clearGnSp <- NULL + clearSn <- NULL + clearSnSp <- NULL + + snpStat <- NULL + geneStat <- NULL + + if (!is.null(candSp)) { + ispecNames <- candSp + } else { + if (filtBy=="snps") { + checker <- snpRet + } else { + checker <- geneRet + } + if (filtType=="whole") { + wn <- length(unlist(cl)) + ispecNames <- checker %>% group_by(species_id) %>% + summarise(grn=sum(n)) %>% + mutate(fnum=grn>filtNum, fper=grn>wn*filtPer) %>% + filter(fnum & fper) %>% pull(species_id) %>% unique() + } else if (filtType=="group") { + ispecNames <- lapply(names(cl), function(cn) { + gn <- length(cl[[cn]]) + checker %>% + filter(.data$group==cn) %>% + mutate(fnum=n>filtNum, fper=n>gn*filtPer) %>% + filter(fnum & fper) %>% pull(species_id) %>% unique() + }) + ispecNames <- Reduce(intersect, ispecNames) + } else { + stop("Please specify whole or group") + } + stana@sampleFilter <- filtType + } + + ## Ensure that the directory is present + snpsSpecNames <- list.files(paste0(midas_merge_dir,"/snps")) + specNames <- intersect(snpsSpecNames, ispecNames) + if (length(specNames)!=0) stana@clearSnps <- specNames + + snpsList <- lapply(specNames, function(i) { + if (!is.na(as.numeric(i))) {## Check that ID is number + qqcat(" @{i}\n") + if (!is.null(taxtbl)){ + spnm <- taxtbl[i,][[tblCol]] + qqcat(" @{spnm}\n") + } + cnc <- paste0(midas_merge_dir,"/snps/",i,"/",i,".snps_freqs.tsv.lz4") + cnd <- gsub(".lz4","",cnc) + system2("lz4", args=c("-d","-f", + paste0(getwd(),"/",cnc), + paste0(getwd(),"/",cnd)), + stdout=FALSE, stderr=FALSE) + df <- read.table(cnd, row.names=1, header=1) + qqcat(" Number of snps: @{dim(df)[1]}\n") + qqcat(" Number of samples: @{dim(df)[2]}\n") + unlink(paste0(getwd(),"/",cnd)) + return(df) + } + }) %>% setNames(specNames) + + ## Info + if (loadInfo) { + snpInfoList <- lapply(specNames, function(i) { + cnc <- paste0(midas_merge_dir,"/snps/",i,"/",i,".snps_info.tsv.lz4") + cnd <- gsub(".lz4","",cnc) + system2("lz4", args=c("-d","-f", + paste0(getwd(),"/",cnc), + paste0(getwd(),"/",cnd)), + stdout=FALSE, stderr=FALSE) + info <- read.table(cnd, row.names=1, header=1) + unlink(paste0(getwd(),"/",cnd)) + return(info) + }) %>% setNames(specNames) + stana@snpsInfo <- snpInfoList + } + + if (loadDepth) { + snpDepthList <- lapply(specNames, function(i) { + cnc <- paste0(midas_merge_dir,"/snps/",i,"/",i,".snps_depth.tsv.lz4") + cnd <- gsub(".lz4","",cnc) + system2("lz4", args=c("-d","-f", + paste0(getwd(),"/",cnc), + paste0(getwd(),"/",cnd)), + stdout=FALSE, stderr=FALSE) + depth <- read.table(cnd, row.names=1, header=1) + unlink(paste0(getwd(),"/",cnd)) + return(depth) + }) %>% setNames(specNames) + stana@snpsDepth <- snpDepthList + } + + ## Ensure that the directory is present + genesSpecNames <- list.files(paste0(midas_merge_dir,"/genes")) + specNames <- intersect(genesSpecNames, ispecNames) + if (length(specNames)!=0) stana@clearGenes <- specNames + + geneList <- lapply(specNames, function(i) { + if (!is.na(as.numeric(i))) { + qqcat(" @{i}\n") + if (!is.null(taxtbl)){ + spnm <- taxtbl[i,][[tblCol]] + qqcat(" @{spnm}\n") + } + cnc <- paste0(midas_merge_dir, + "/genes/",i,"/",i,".genes_",geneType,".tsv.lz4") + cnd <- gsub(".lz4","",cnc) + system2("lz4", args=c("-d", "-f", + paste0(getwd(),"/",cnc), + paste0(getwd(),"/",cnd)), + stdout=FALSE, stderr=FALSE) + df <- read.table(cnd, row.names=1, header=1) + qqcat(" Number of genes: @{dim(df)[1]}\n") + qqcat(" Number of samples: @{dim(df)[2]}\n") + unlink(paste0(getwd(),"/",cnd)) + df + } + }) %>% setNames(specNames) + + + + stana@snps <- snpsList + stana@genes <- geneList + stana@ids <- union(names(geneList),names(snpsList)) + stana <- initializeStana(stana,cl) + + if (!is.null(taxtbl)){ + stana@clearSnpsSpecies <- taxtbl[stana@clearSnps,][[tblCol]] + stana@clearGenesSpecies <- taxtbl[stana@clearGenes,][[tblCol]] + } + stana +} \ No newline at end of file diff --git a/R/stana.R b/R/stana.R index 9a8bdb4..c4ba4df 100644 --- a/R/stana.R +++ b/R/stana.R @@ -24,6 +24,7 @@ setClass("stana", slots=list( snpsInfo="list", snpsDepth="list", snpsSummary="data.frame", + genesSummary="data.frame", relab="data.frame", geneType="character", genes="list", diff --git a/man/calcKO.Rd b/man/calcKO.Rd new file mode 100644 index 0000000..489a840 --- /dev/null +++ b/man/calcKO.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doGSEA.R +\name{calcKO} +\alias{calcKO} +\title{calcKO} +\usage{ +calcKO(stana, candSp = NULL, how = mean) +} +\description{ +calcKO +} diff --git a/man/doGSEA.Rd b/man/doGSEA.Rd new file mode 100644 index 0000000..15fe46d --- /dev/null +++ b/man/doGSEA.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doGSEA.R +\name{doGSEA} +\alias{doGSEA} +\title{doGSEA} +\usage{ +doGSEA(stana, candSp = NULL, cl = NULL, eps = 0.01, how = mean) +} +\description{ +doGSEA +} diff --git a/man/loadMIDAS2.Rd b/man/loadMIDAS2.Rd index f04f716..ca41a74 100644 --- a/man/loadMIDAS2.Rd +++ b/man/loadMIDAS2.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/loadMIDAS2.R +% Please edit documentation in R/loadMIDAS2Refine.R \name{loadMIDAS2} \alias{loadMIDAS2} \title{loadMIDAS2} @@ -10,6 +10,7 @@ loadMIDAS2( filtNum = 2, db = "gtdb", only_stat = FALSE, + filtBy = "snps", filtPer = 0.8, taxtbl = NULL, candSp = NULL, @@ -31,6 +32,8 @@ loadMIDAS2( \item{only_stat}{only samples per species is returned (snpStat and geneStat)} +\item{filtBy}{filter by {snps} number or {genes} number (default to snps)} + \item{filtPer}{filter by fraction} \item{taxtbl}{tax table, row.names: 6-digits MIDAS2 ID and `GTDB species` (gtdb) diff --git a/man/loadMIDAS2Legacy.Rd b/man/loadMIDAS2Legacy.Rd new file mode 100644 index 0000000..cb177fa --- /dev/null +++ b/man/loadMIDAS2Legacy.Rd @@ -0,0 +1,56 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/loadMIDAS2.R +\name{loadMIDAS2Legacy} +\alias{loadMIDAS2Legacy} +\title{loadMIDAS2Legacy} +\usage{ +loadMIDAS2Legacy( + midas_merge_dir, + cl = NULL, + filtNum = 2, + db = "gtdb", + only_stat = FALSE, + filtPer = 0.8, + taxtbl = NULL, + candSp = NULL, + filtType = "group", + geneType = "copynum", + loadSummary = TRUE, + loadInfo = TRUE, + loadDepth = TRUE +) +} +\arguments{ +\item{midas_merge_dir}{path to merged directory} + +\item{cl}{named list of category for samples} + +\item{filtNum}{the species with the samples above this number will be returned} + +\item{db}{data base that was used to profile, default to gtdb.} + +\item{only_stat}{only samples per species is returned (snpStat and geneStat)} + +\item{filtPer}{filter by fraction} + +\item{taxtbl}{tax table, row.names: 6-digits MIDAS2 ID and `GTDB species` (gtdb) +or `Lineage` column.} + +\item{candSp}{candidate species ID} + +\item{filtType}{"whole" or "group"} + +\item{geneType}{"presabs" or "copynum"} + +\item{loadSummary}{default to FALSE, load summary information.} + +\item{loadInfo}{default to FALSE, load info information.} + +\item{loadDepth}{default to FALSE, load depth information.} +} +\description{ +load the MIDAS2 merge command output (outdated). +The location to lz4 binary must be added to PATH. +Taxonomy table can be loaded from downloaded MIDAS2 db directory (metadata.tsv). +If provided, additionally show tax names. +} diff --git a/man/obtain_positions.Rd b/man/obtain_positions.Rd new file mode 100644 index 0000000..eba7885 --- /dev/null +++ b/man/obtain_positions.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doGSEA.R +\name{obtain_positions} +\alias{obtain_positions} +\title{obtain_positions} +\usage{ +obtain_positions(stana, candSp, geneID) +} +\description{ +obtain_positions +} diff --git a/man/reverse_annot.Rd b/man/reverse_annot.Rd new file mode 100644 index 0000000..7045603 --- /dev/null +++ b/man/reverse_annot.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doGSEA.R +\name{reverse_annot} +\alias{reverse_annot} +\title{reverse_annot} +\usage{ +reverse_annot(stana, candSp, candidate, col = "KEGG_ko") +} +\description{ +reverse_annot +} diff --git a/man/summariseAbundance.Rd b/man/summariseAbundance.Rd index 51f1f08..561d266 100644 --- a/man/summariseAbundance.Rd +++ b/man/summariseAbundance.Rd @@ -4,7 +4,7 @@ \alias{summariseAbundance} \title{summariseAbundance} \usage{ -summariseAbundance(stana, sp, anno, how = mean) +summariseAbundance(stana, sp, anno, how = mean, verbose = FALSE) } \arguments{ \item{stana}{stana object} @@ -14,6 +14,8 @@ summariseAbundance(stana, sp, anno, how = mean) \item{anno}{annotation tibble obtained by checkEGGNOG} \item{how}{summarising function, default to mean} + +\item{verbose}{output messages} } \value{ data.frame