Skip to content

Commit

Permalink
group labels umap
Browse files Browse the repository at this point in the history
  • Loading branch information
trvinh committed Aug 29, 2024
1 parent d0a4e5f commit aca3ac1
Show file tree
Hide file tree
Showing 9 changed files with 193 additions and 87 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: PhyloProfile
Version: 1.19.8
Date: 2024-08-26
Version: 1.19.9
Date: 2024-08-29
Title: PhyloProfile
Authors@R: c(
person("Vinh", "Tran", role = c("aut", "cre"), email = "[email protected]", comment=c(ORCID="0000-0001-6772-7595")),
Expand Down
1 change: 0 additions & 1 deletion R/createDomainPlot.R
Original file line number Diff line number Diff line change
Expand Up @@ -709,7 +709,6 @@ joinPlotMergeLegends <- function(
length(levels(as.factor(df2$feature)))
)
)

# create a temp plot that contains all features
mergedDf <- rbind(df1, df2)
colorScheme <- structure(
Expand Down
1 change: 1 addition & 0 deletions R/createTaxonomyMatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ getTaxonomyInfo <- function(inputTaxa = NULL, currentNCBIinfo = NULL) {
refEntry <- currentNCBIinfo[currentNCBIinfo$ncbiID == refID, ]
lastID <- refEntry$parentID
inputTaxaInfo <- refEntry
if(length(lastID) == 0) stop(paste("ERROR with", refID))
while (lastID != 1) {
if (lastID %in% names(tmp)) {
inputTaxaInfo <- rbindlist(
Expand Down
45 changes: 24 additions & 21 deletions R/umapClustering.R
Original file line number Diff line number Diff line change
Expand Up @@ -182,9 +182,10 @@ umapClustering <- function(

#' Reduce the number of labels for UMAP plot based on the gene/taxon frequency
#' @export
#' @usage groupLabelUmapData(data4umap = NULL, freqCutoff = 0)
#' @usage groupLabelUmapData(data4umap = NULL, freqCutoff = c(0,200))
#' @param data4umap data for UMAP clustering (output from prepareUmapData)
#' @param freqCutoff gene/taxon frequency cutoff
#' @param freqCutoff gene/taxon frequency cutoff range. Any labels that are
#' outside of this range will be assigned as [Other]
#' @return A dataframe similar to input data4umap, but with modified Label
#' column, where less frequent labels are grouped together as "Other"
#' @author Vinh Tran [email protected]
Expand All @@ -195,26 +196,26 @@ umapClustering <- function(
#' )
#' longDf <- createLongMatrix(rawInput)
#' data4umap <- prepareUmapData(longDf, "phylum")
#' groupLabelUmapData(data4umap, freqCutoff = 3)
#' groupLabelUmapData(data4umap, freqCutoff = c(3,5))

groupLabelUmapData <- function(data4umap = NULL, freqCutoff = 0) {
groupLabelUmapData <- function(data4umap = NULL, freqCutoff = c(0,200)) {
if (is.null(data4umap)) stop("Input data cannot be NULL!")
if (length(data4umap) == 0) stop("Input data cannot be EMPTY!")

# minFreq <- tail(sort(unique(data4umap$n)), labelNr)[1]
# keepLabel <- unique(data4umap$Label[data4umap$n >= minFreq])
# data4umap$Label[!(data4umap$Label %in% keepLabel)] <- "[Other]"
data4umap$Label[data4umap$n < freqCutoff] <- "[Other]"
data4umap$Label[
data4umap$n < freqCutoff[1] | data4umap$n > freqCutoff[2]
] <- "[Other]"
return(data4umap)
}

#' Create UMAP cluster plot
#' @export
#' @usage createUmapPlotData(umapData = NULL, data4umap = NULL, freqCutoff = 0,
#' excludeTaxa = "None")
#' @usage createUmapPlotData(umapData = NULL, data4umap = NULL,
#' freqCutoff = c(0,200), excludeTaxa = "None")
#' @param umapData data contains UMAP cluster (output from umapClustering())
#' @param data4umap data for UMAP clustering (output from prepareUmapData())
#' @param freqCutoff gene/taxon frequency cutoff
#' @param freqCutoff gene/taxon frequency cutoff range. Any labels that are
#' outside of this range will be assigned as [Other]
#' @param excludeTaxa hide taxa from plot. Default: "None"
#' @importFrom utils tail
#' @return A plot as ggplot object
Expand All @@ -230,7 +231,8 @@ groupLabelUmapData <- function(data4umap = NULL, freqCutoff = 0) {
#' createUmapPlotData(umapData, data4umap)

createUmapPlotData <- function(
umapData = NULL, data4umap = NULL, freqCutoff = 0, excludeTaxa = "None"
umapData = NULL, data4umap = NULL, freqCutoff = c(0, 200),
excludeTaxa = "None"
) {
if (is.null(umapData) | is.null(data4umap))
stop("Input data cannot be NULL!")
Expand All @@ -241,8 +243,7 @@ createUmapPlotData <- function(
data4umap$X <- umapData$layout[,1]
data4umap$Y <- umapData$layout[,2]
# join less freq items into "other"
# data4umap <- groupLabelUmapData(data4umap, freqCutoff)
data4umap$Label[data4umap$n < freqCutoff] <- "[Other]"
data4umap <- groupLabelUmapData(data4umap, freqCutoff)
# exclude taxa
if (length(excludeTaxa) > 0) {
data4umap$X[data4umap$Label %in% excludeTaxa] <- NA
Expand All @@ -254,7 +255,7 @@ createUmapPlotData <- function(

#' Create UMAP cluster plot
#' @export
#' @usage plotUmap(plotDf = NULL, legendPos = "right", colorPalette = "Set2",
#' @usage plotUmap(plotDf = NULL, legendPos = "bottom", colorPalette = "Set2",
#' transparent = 0, textSize = 12, font = "Arial", highlightTaxa = NULL)
#' @param plotDf data for UMAP plot
#' @param legendPos position of legend. Default: "right"
Expand All @@ -278,7 +279,7 @@ createUmapPlotData <- function(
#' plotUmap(plotDf, font = "sans")

plotUmap <- function(
plotDf = NULL, legendPos = "right", colorPalette = "Set2",
plotDf = NULL, legendPos = "bottom", colorPalette = "Set2",
transparent = 0, textSize = 12, font = "Arial", highlightTaxa = NULL
) {
if (is.null(plotDf)) stop("Input data cannot be NULL!")
Expand All @@ -300,6 +301,8 @@ plotUmap <- function(
), .Names = levels(as.factor(plotDf$Label))
)
}
# adapt plot height based on number of labels

# generate plot
plot <- ggplot(plotDf, aes(x = X, y = Y, color = Label)) +
geom_point(aes(size = Freq), alpha = 1 - transparent) +
Expand All @@ -309,13 +312,13 @@ plotUmap <- function(
# change legend title
if ("ncbiID" %in% colnames(plotDf)) {
plot <- plot + guides(
color = guide_legend(override.aes = list(alpha = 1)),
size = guide_legend(title = "Number of genes")
color = guide_legend(override.aes = list(alpha = 1), ncols = 5),
size = guide_legend(title = "Number of genes", ncols = 2)
)
} else
} else
plot <- plot + guides(
color = guide_legend(override.aes = list(alpha = 1)),
size = guide_legend(title = "Number of taxa")
color = guide_legend(override.aes = list(alpha = 1), ncols = 5),
size = guide_legend(title = "Number of taxa", ncols = 1)
)
plot <- plot + theme(
legend.position = legendPos,
Expand Down
136 changes: 105 additions & 31 deletions inst/PhyloProfile/server.R
Original file line number Diff line number Diff line change
Expand Up @@ -3122,15 +3122,47 @@ shinyServer(function(input, output, session) {
shinyjs::disable("umapDataType")
}
})

# * data for UMAP clustering -----------------------------------------------
umapData <- reactive({
req(getMainInput())
req(u$doUmapPlot)
if(is.null(getMainInput())) stop("Input data is NULL!")
withProgress(
message = "Preparing data for clustering...", value = 0.5, {
umapData <- prepareUmapData(
getMainInput(), input$umapRank, input$umapClusteringType,
getTaxDBpath(), input$umapFilterVar, input$umapCutoff,
input$umapGroupLabelsBy
)
return(umapData)
}
)
})

# * read custom labels -----------------------------------------------------
values <- reactiveValues(
uploadLabelState = NULL
)

output$umapCustomLabel.ui <- renderUI({
if (input$umapClusteringType == "taxa")
fileInput("umapCustomLabel", "Add customize labels")
fileInput("umapCustomLabel", "Add customized labels")
})

observeEvent(input$input$umapCustomLabel, {
values$uploadLabelState <- 'uploaded'
})

observeEvent(input$umapResetLables, {
values$uploadLabelState <- NULL
updateTextInput(session, "umapGroupHigherRank", value = "")
})

getCustomLabels <- reactive({
req(getMainInput())
if (is.null(values$uploadLabelState))
return(data.frame(ncbiID = c(), label = c()))
filein <- input$umapCustomLabel
if (!is.null(filein)) {
customLabels <- read.table(
Expand All @@ -3142,39 +3174,80 @@ shinyServer(function(input, output, session) {
return(customLabels[customLabels$ncbiID %in% mainInput$ncbiID,])
} else return(data.frame(ncbiID = c(), label = c()))
})

# * data for UMAP clustering -----------------------------------------------
umapData <- reactive({
req(getMainInput())
req(u$doUmapPlot)
if(is.null(getMainInput())) stop("Input data is NULL!")
withProgress(
message = "Preparing data for clustering...", value = 0.5, {
umapData <- prepareUmapData(
getMainInput(), input$umapRank, input$umapClusteringType,
getTaxDBpath(), input$umapFilterVar, input$umapCutoff,
input$umapGroupLabelsBy
)
if (input$umapClusteringType == "taxa") {
customLabels <- getCustomLabels()
if(nrow(customLabels) > 0) {
umapData$Label[
umapData$ncbiID %in% customLabels$ncbiID
] <- customLabels$Label

# * apply user-defined labels ----------------------------------------------
renameLabelsUmap <- reactive({
req(umapData())
input$umapApplyChangeLables
umapData <- umapData()
isolate({
if (input$umapClusteringType == "taxa") {
# group labels into higher rank
higherRankTaxa <- unlist(strsplit(input$umapGroupHigherRank, ";"))
higherRankTaxa <- trimws(higherRankTaxa)
if (length(higherRankTaxa) > 0) {
taxMatrix <- getTaxonomyMatrix(getTaxDBpath())
nameList <- getNameList(getTaxDBpath())
selDf <- data.frame(
selRank = nameList$rank[nameList$fullName %in% higherRankTaxa],
selID = nameList$ncbiID[nameList$fullName %in% higherRankTaxa],
Label = nameList$fullName[nameList$fullName %in% higherRankTaxa]
)
selDf <- selDf[complete.cases(selDf),]
if (nrow(selDf) > 0) {
selTaxList <- lapply(
seq(nrow(selDf)), function (x) {
selRank <- selDf$selRank[x]
selID <- selDf$selID[x]
if (!(selRank %in% mainTaxonomyRank()))
selRank <- paste0("norank_", selID)
selRank <- quo(!! sym(selRank))
selTaxDf <- taxMatrix %>%
filter((!!selRank) %in% selID) %>%
select(abbrName, !!selRank)
colnames(selTaxDf) <- c("ncbiID", "supertaxonID")
selTaxDf$sel_label <- selDf$Label[selDf$selID == selID]
return(selTaxDf)
}
)
joinedSelTaxDf <- do.call(rbind, selTaxList)
joinedSelTaxDf <- joinedSelTaxDf %>% group_by(ncbiID) %>%
filter(supertaxonID == min(supertaxonID))
umapData <- left_join(umapData, joinedSelTaxDf, by = "ncbiID") %>%
mutate(Label = ifelse(!is.na(sel_label), sel_label, Label)) %>%
select(-c(supertaxonID, sel_label))
}
}
return(umapData)
# apply custom labels (if provided)
customLabels <- getCustomLabels()
if(nrow(customLabels) > 0) {
umapData$Label[
umapData$ncbiID %in% customLabels$ncbiID
] <- customLabels$Label
}
}
)
})
return(umapData)
})

output$umapGroupHigherRank.warning <- renderUI({
req(input$umapGroupHigherRank)
if (length(input$umapGroupHigherRank) > 0) {
list(
em(paste("Click `Change labels` to apply. If you don't see your",
"specified labels, please check for typos!")),
br(),br()
)
}
})

# * UMAP clustered data ----------------------------------------------------
umapCluster <- reactive({
req(umapData())
req(renameLabelsUmap())
withProgress(
message = "Performing UMAP clustering...", value = 0.5, {
umapData.umap <- umapClustering(
umapData(), input$umapClusteringType, input$umapDataType
renameLabelsUmap(), input$umapClusteringType, input$umapDataType
)
return(umapData.umap)
}
Expand All @@ -3183,8 +3256,8 @@ shinyServer(function(input, output, session) {

# * generate list of UMAP labels -------------------------------------------
output$umapTaxa.ui <- renderUI({
req(umapData())
df <- groupLabelUmapData(umapData(), input$umapLabelNr)
req(renameLabelsUmap())
df <- groupLabelUmapData(renameLabelsUmap(), input$umapLabelNr)
list(
selectInput(
"excludeUmapTaxa", "Choose labels to hide", multiple = TRUE,
Expand All @@ -3205,7 +3278,8 @@ shinyServer(function(input, output, session) {
selectFreq <- tail(freqList, 5)[1]
updateSliderInput(
session, "umapLabelNr", "Freq cutoff", min = freqList[1],
max = tail(freqList, 1), step = 1, value = selectFreq
max = tail(freqList, 1), step = 1,
value = c(selectFreq, tail(freqList, 1))
)
})

Expand All @@ -3216,9 +3290,9 @@ shinyServer(function(input, output, session) {
req(getMainInput())
if(is.null(getMainInput())) stop("Input data is NULL!")
req(umapCluster())
req(umapData())
req(renameLabelsUmap())
plotDf <- createUmapPlotData(
umapCluster(), umapData(), freqCutoff = input$umapLabelNr,
umapCluster(), renameLabelsUmap(), freqCutoff = input$umapLabelNr,
excludeTaxa = input$excludeUmapTaxa
)
return(plotDf)
Expand Down Expand Up @@ -3260,7 +3334,7 @@ shinyServer(function(input, output, session) {
)
)
})

# When a double-click happens, check if there's a brush on the plot
# If so, zoom to the brush bounds; if not, reset the zoom.
observeEvent(input$umapdblClick, {
Expand Down Expand Up @@ -3302,7 +3376,7 @@ shinyServer(function(input, output, session) {
c("umapData.RData")
},
content = function(fileName) {
data4umap <- umapData()
data4umap <- renameLabelsUmap()
umapClusteredData <- umapCluster()
umapPlotData <- umapPlotData()
save(data4umap, umapClusteredData, umapPlotData, file = fileName)
Expand Down
Loading

0 comments on commit aca3ac1

Please sign in to comment.