Improved () method. Also tests for that method added. Closes #44.

iferres · May 7, 2021 · 24d51aa · 24d51aa
1 parent af17214
commit 24d51aa
Show file tree

Hide file tree

Showing 3 changed files with 93 additions and 22 deletions.
diff --git a/NEWS.md b/NEWS.md
@@ -2,6 +2,8 @@
 
 * Added panaroo_2_pagoo function to read the output of the panaroo pangenome reconstruction software.
 
+* Improved $add_metadata() method. Now columns with the same name as the one already present are overwritten instead of duplicated (closes #44). Also, users can now provide metadata covering the pangenome partially, i.e, not providing all the gene/clusters/organisms in the mapping column. Those entities without metadata provided are filled with NAs.
+
 # pagoo 0.3.8
 
 * Improve backward compatibility. Older pagoo objects created by third party packages which depend on pagoo do not have an attribute required to successfully load them into session. Now the approach is to downgrade the object to a base pagoo class, or to provide the namespace using the pkg argument.

diff --git a/R/PgR6.R b/R/PgR6.R
@@ -201,48 +201,67 @@ PgR6 <- R6Class('PgR6',
                     }
 
                     if (map == 'org'){
+
                       if('org'%in%colnames(data)){
-                        ma <- match(private$.organisms$org, data$org)
+                        ma <- match(data$org, private$.organisms$org)
                         if (any(is.na(ma))) stop('data$org do not match with object organisms.')
-                        if (dim(data)[1]!=dim(private$.organisms)[1]){
-                          stp <- paste('data has', dim(data)[1], 'rows while object require', dim(private$.organisms)[1],'.')
-                          stop(stp)
-                        }
-                        data <- data[ma, ]
                         oc <- which(colnames(data)=='org')
-                        private$.organisms <- cbind(private$.organisms, DataFrame(data[, -oc, drop=F]))
+                        nwcls <- colnames(data)[-oc]
+                        repcol <- nwcls %in% colnames(private$.organisms)
+                        invisible(lapply(which(repcol), function(x){
+                          wr <- paste0("Column '", nwcls[x], "' already exists, overwriting.")
+                          warning(wr, immediate. = TRUE)
+                          private$.organisms[[nwcls[x]]] <- NULL
+                        }))
+                        for (i in seq_along(nwcls)){
+                          private$.organisms[[nwcls[i]]][ma] <- data[[nwcls[i]]]
+                        }
                       }else{
                         stop('"data" should contain an "org" column.')
                       }
+
                     }else if (map == 'cluster'){
+
                       if('cluster'%in%colnames(data)){
-                        ma <- match(private$.clusters$cluster, data$cluster)
+                        ma <- match(data$cluster, private$.clusters$cluster)
                         if (any(is.na(ma))) stop('data$cluster do not match with object "cluster" column.')
-                        if (dim(data)[1]!=dim(private$.clusters)[1]){
-                          stp <- paste('data has', dim(data)[1], 'rows while object require', dim(private$.clusters)[1],'.')
-                          stop(stp)
-                        }
-                        data <- data[ma, ]
                         oc <- which(colnames(data)=='cluster')
-                        private$.clusters <- cbind(private$.clusters, DataFrame(data[, -oc, drop=F]))
+                        nwcls <- colnames(data)[-oc]
+                        repcol <- nwcls %in% colnames(private$.clusters)
+                        invisible(lapply(which(repcol), function(x){
+                          wr <- paste0("Column '", nwcls[x], "' already exists, overwriting.")
+                          warning(wr, immediate. = TRUE)
+                          private$.clusters[[nwcls[x]]] <- NULL
+                        }))
+                        for (i in seq_along(nwcls)){
+                          private$.clusters[[nwcls[i]]][ma] <- data[[nwcls[i]]]
+                        }
                       }else{
-                        stop('"data" should contain an "cluster" column.')
+                        stop('"data" should contain a "cluster" column.')
                       }
+
                     }else{
+
                       if('gid'%in%colnames(data)){
-                        ma <- match(private$.data$gid, data$gid)
+                        ma <- match(data$gid, private$.data$gid)
                         if (any(is.na(ma))) stop('data$gid do not match with object gid.')
-                        if (dim(data)[1]!=dim(private$.data)[1]){
-                          stp <- paste('data has', dim(data)[1], 'rows while object require', dim(private$.data)[1],'.')
-                          stop(stp)
+                        oc <- which(colnames(data) %in% c("cluster", "org", "gene", "gid"))
+                        nwcls <- colnames(data)[-oc]
+                        repcol <- nwcls %in% colnames(private$.data)
+                        invisible(lapply(which(repcol), function(x){
+                          wr <- paste0("Column '", nwcls[x], "' already exists, overwriting.")
+                          warning(wr, immediate. = TRUE)
+                          private$.data[[nwcls[x]]] <- NULL
+                        }))
+                        for (i in seq_along(nwcls)){
+                          private$.data[[nwcls[i]]][ma] <- data[[nwcls[i]]]
                         }
-                        data <- data[ma, ]
-                        oc <- which(colnames(data)=='gid')
-                        private$.data <- cbind(private$.data, DataFrame(data[, -oc, drop=F]))
                       }else{
                         stop('"data" should contain a "gid" column.')
                       }
+
                     }
+
                     invisible(self)
                   },
 

diff --git a/tests/testthat/test_input.R b/tests/testthat/test_input.R
@@ -151,6 +151,56 @@ test_that('$save_pangenomeRDS() and load_pangenome() works', {
   file.remove(out_rds)
 })
 
+
+test_that("Adding gene metadata after object creation works.",{
+  sep <- "__"
+  p <- pagoo(data = data[, -4], sep = sep)
+  df <- data.frame(gid = paste(data$org, data$gene, sep = sep),
+                   annot = data[, 4])
+  p$add_metadata(map = "gid", df)
+  # p2 <- pagoo(data, sep = sep)
+
+  expect_is(p, "R6")
+  expect_is(p, "PgR6")
+  expect_is(p, "PgR6M")
+
+  expect_is(p$genes, "DFrameList")
+  expect_length(unlist(p$genes, use.names = FALSE), 5)
+  expect_named(unlist(p$genes, use.names = FALSE),
+               c("cluster", "org", "gene", "gid", "annot"))
+  # expect_equal(p$genes, p2$genes)
+})
+
+test_that("Adding cluster metadata after object creation works.",{
+  sep <- "__"
+  p <- pagoo(data = data, sep = sep)
+  p$add_metadata(map = "cluster", clust_meta)
+
+  expect_is(p, "R6")
+  expect_is(p, "PgR6")
+  expect_is(p, "PgR6M")
+
+  expect_is(p$organisms, "DFrame")
+  expect_length(p$clusters, dim(clust_meta)[2])
+  expect_named(p$clusters, names(clust_meta))
+  # expect_equal(p$clusters, p2$clusters)
+})
+
+test_that("Adding organism metadata after object creation works.",{
+  sep <- "__"
+  p <- pagoo(data = data, sep = sep)
+  p$add_metadata(map = "org", orgs_meta)
+
+  expect_is(p, "R6")
+  expect_is(p, "PgR6")
+  expect_is(p, "PgR6M")
+
+  expect_is(p$organisms, "DFrame")
+  expect_length(p$organisms, dim(orgs_meta)[2])
+  expect_named(p$organisms, names(orgs_meta))
+  # expect_equal(p$organisms, p2$organisms)
+})
+
 ## Input from third party pangenome reconstruction software
 #   MISSING