merge

jdieramon · May 16, 2024 · 2ba8d39 · 2ba8d39
2 parents f0baff6 + c2cabf0
commit 2ba8d39
Show file tree

Hide file tree

Showing 11 changed files with 89 additions and 53 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: refseqR
 Type: Package
 Title: Common Computational Operations Working with RefSeq Entries (GenBank)
-Version: 1.0.1
+Version: 1.0.2
 Authors@R: 
     c(person("Jose V.", "Die", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0002-7506-8590")),

diff --git a/R/refseq_AAseq.R b/R/refseq_AAseq.R
@@ -39,3 +39,4 @@ refseq_AAseq <- function(xp) {
   my_aa
 
 }
+
diff --git a/R/refseq_CDSseq.R b/R/refseq_CDSseq.R
@@ -30,9 +30,9 @@ refseq_CDSseq <- function(xm) {
   res     <-  refseq_CDScoords(xm)
   my_cds  <-  sapply(seq(res), function(i) {
     cds   <-  rentrez::entrez_fetch(db = "nuccore", id = names(res)[i],
-                                rettype = "fasta",
-                                seq_start = start(res)[i],
-                                seq_stop = end(res)[i])
+                                    rettype = "fasta",
+                                    seq_start = start(res)[i],
+                                    seq_stop = end(res)[i])
     cds_tidy <-  strsplit(cds, "\n")
     cds_tidy <- as.character(paste0(cds_tidy[[1]][2:length(cds_tidy[[1]])], collapse = ""))
 

diff --git a/R/refseq_XPlength.R b/R/refseq_XPlength.R
@@ -3,9 +3,11 @@
 #' @description \code{refseq_XPlength()} Returns the amino acid length from a single XP accession.
 #'
 #' @usage
-#' refseq_XPlength(xp)
+#' refseq_XPlength(xp, retries)
 #'
 #' @param xp A character string of the XP id.
+#' @param retries A numeric value to control the number of retry attempts to handle internet errors.
+#'
 #'
 #' @returns A numeric value representing the aa length of the protein specified as `xp`.
 #'
@@ -14,16 +16,28 @@
 #' @examples
 #'  # Get the amino acid length from a set of XP accessions
 #'  xp = c("XP_004487758", "XP_004488550")
-#'  sapply(xp, function(x) refseq_XPlength(x), USE.NAMES = FALSE)
+#'  sapply(xp, function(x) refseq_XPlength(x, retries = 3), USE.NAMES = FALSE)
 #'
 #' @author Jose V. Die
 #'
 #' @export
 
-refseq_XPlength <- function(xp) {
-  # Get the item list for that protein id
-  xpinfo <- rentrez::entrez_summary(db = "protein", id = xp)
+refseq_XPlength <- function(xp, retries = 3) {
+  # Get the item list for that protein id; keep any error as a value,
+  # since a return() inside a tryCatch handler only leaves the handler.
+  xpinfo <- tryCatch(
+    rentrez::entrez_summary(db = "protein", id = xp),
+    error = function(e) e
+  )
+  if (inherits(xpinfo, "error")) {
+    if (grepl("HTTP error: 502", conditionMessage(xpinfo)) && retries > 0) {
+      message("Retrying...\n")
+      Sys.sleep(5)  # Wait for 5 seconds before retrying
+      return(refseq_XPlength(xp, retries - 1))
+    }
+    stop(xpinfo)  # Out of retries, or not a 502: re-raise unchanged
+  }
   # Get the protein length
-  xpinfo$slen
+  return(xpinfo$slen)
 }
 
diff --git a/R/refseq_fromGene.R b/R/refseq_fromGene.R
@@ -76,4 +76,4 @@ refseq_fromGene <- function(gene_symbol , sequence = "XM", retries = 3) {
     }
   })
   return(ncbi)
-  }
+}
diff --git a/R/refseq_fromXM.R b/R/refseq_fromXM.R
@@ -50,8 +50,8 @@ refseq_fromXM <- function(xm, feat) {
 
     # Build dataframe
     df <-  data.frame(matrix(unlist(mrna), nrow = length(mrna)/length(feat),
-                           byrow = T),
-                    stringsAsFactors = F)
+                             byrow = T),
+                      stringsAsFactors = F)
 
     colnames(df) = feat
 

diff --git a/R/refseq_geneSymbol.R b/R/refseq_geneSymbol.R
@@ -3,10 +3,11 @@
 #' @description \code{refseq_geneSymbol()} Returns the gene symbol from a single XP or XM accession.
 #'
 #' @usage
-#' refseq_geneSymbol (id, db)
+#' refseq_geneSymbol (id, db, retries)
 #'
 #' @param id A character string of the XP or XM id.
 #' @param db A character string of the "nuccore" or "protein" database.
+#' @param retries A numeric value to control the number of retry attempts to handle internet errors.
 #'
 #' @returns A character vector containing the gene symbol corresponding to the accession specified as `id`.
 #'
@@ -16,27 +17,43 @@
 #' @examples
 #' # Get the gene symbol from a set of XM accessions
 #' xm = c("XM_004487701", "XM_004488493")
-#' sapply(xm, function(x) refseq_geneSymbol (x, db = "nuccore"), USE.NAMES = FALSE)
+#' sapply(xm, function(x) refseq_geneSymbol (x, db = "nuccore", retries = 3), USE.NAMES = FALSE)
 #'
 #' # Get the gene symbol from a set of XP accessions
 #' xp = c("XP_004487758")
-#' sapply(xp, function(x) refseq_geneSymbol (x, db = "protein"), USE.NAMES = FALSE)
+#' sapply(xp, function(x) refseq_geneSymbol (x, db = "protein", retries = 3), USE.NAMES = FALSE)
 #'
 #' @author Jose V. Die
 #'
 #' @export
 
-refseq_geneSymbol <- function(id, db = "protein") {
-  if (db == "protein") {
-    id_elink = rentrez::entrez_link(dbfrom = "protein", id = id, db= "gene")
-    gene_id = id_elink$links$protein_gene
-  } else {
-    id_elink = rentrez::entrez_link(dbfrom = "nuccore", id = id, db= "gene")
-    gene_id = id_elink$links$nuccore_gene
-  }
+refseq_geneSymbol <- function(id, db = "protein", retries = 3) {
+
+  # Any db other than "protein" is queried as "nuccore".
+  dbfrom <- if (db == "protein") "protein" else "nuccore"
+
+  # Capture the error as a value instead of handling it inside tryCatch():
+  # a return() in a handler only leaves the handler, so the retried result
+  # must be returned from the function body itself.
+  id_elink <- tryCatch(
+    rentrez::entrez_link(dbfrom = dbfrom, id = id, db = "gene"),
+    error = function(e) e
+  )
+
+  if (inherits(id_elink, "error")) {
+    # Only an HTTP 502 is retried; anything else is re-raised.
+    if (grepl("HTTP error: 502", conditionMessage(id_elink)) && retries > 0) {
+      message("Retrying...\n")
+      Sys.sleep(5)  # Wait for 5 seconds before retrying
+      return(refseq_geneSymbol(id, db, retries - 1))
+    }
+    stop(id_elink)
+  }
+
+  # Links are stored under "<dbfrom>_gene" (protein_gene or nuccore_gene).
+  gene_id <- id_elink$links[[paste0(dbfrom, "_gene")]]
 
   gene_summ = rentrez::entrez_summary(db = "gene", id = gene_id)
-  gene_summ$name
+  return(gene_summ$name)
 
 }
-
diff --git a/R/refseq_mol_wt.R b/R/refseq_mol_wt.R
@@ -38,27 +38,27 @@ refseq_mol_wt <- function(xp) {
 
   mol_wt <-  0 # keep track of success
 
-    listName <-  strsplit(listName, "\n")
+  listName <-  strsplit(listName, "\n")
 
-    for(i in seq(listName[[1]])) {
-        val <- listName[[1]][i]
-        #remove whitespaces from the string
-        val <-  gsub(" ", "", val)
-        #remove "/" symbol from the string
-        val <-  gsub("/", "", val)
-        # split the string from "="
-        val <-  strsplit(val, "=")
+  for(i in seq(listName[[1]])) {
+    val <- listName[[1]][i]
+    #remove whitespaces from the string
+    val <-  gsub(" ", "", val)
+    #remove "/" symbol from the string
+    val <-  gsub("/", "", val)
+    # split the string from "="
+    val <-  strsplit(val, "=")
 
-        if(feat %in% val[[1]][1]) {
-            # 2nd element of the list contains the mol.wt
-            return(as.numeric(val[[1]][2]))
-            mol_wt <-  mol_wt+1
-        }
+    if(feat %in% val[[1]][1]) {
+      # 2nd element of the list contains the mol.wt
+      return(as.numeric(val[[1]][2]))
+      mol_wt <-  mol_wt+1
     }
-    # Defensive Programming
-    # if the loop reaches the last entry of the list and couldn´t find the 'feat', return 0
-    if(i == length(listName[[1]]) & mol_wt == 0) {
-      return(0)
-      }
+  }
+  # Defensive Programming
+  # if the loop reaches the last entry of the list and couldn´t find the 'feat', return 0
+  if(i == length(listName[[1]]) & mol_wt == 0) {
+    return(0)
+  }
 }
 
diff --git a/man/refseq_XPlength.Rd b/man/refseq_XPlength.Rd
diff --git a/man/refseq_geneSymbol.Rd b/man/refseq_geneSymbol.Rd
diff --git a/vignettes/refseqR.Rmd b/vignettes/refseqR.Rmd
@@ -13,6 +13,7 @@ knitr::opts_chunk$set(
   comment = "#>"
 )
 ```
+
 Load the library.
 ```{r setup}
 library(refseqR)
@@ -68,7 +69,7 @@ strsplit(mrna_gb, "\n")[[1]][1:30]
 
 The `refseq_fromXM` function serves as a wrapper built on top of 'entrez_summary' from the 'rentrez' package, designed to extract specific features from the obtained data. Typically, my focus lies on key features like id, accession, title, update, or sequence length (bp). However, you have the flexibility to tailor the function to extract additional features of interest from the `esummary_list` object.
 
-```{r}
+```{r, eval = F}
 xm = c("XM_004487701", "XM_004488493", "XM_004501904")
 feat = c("caption", "moltype", "sourcedb", "slen", "title")
 refseq_fromXM(xm ,feat)
@@ -114,7 +115,7 @@ refseq_XMfromXP(xp)
 
 Two specific functions prove useful for managing protein accessions: `refseq_XPlength` offers the amino acid length of the sequence, while `refseq_mol_wt` provides the molecular weight in Daltons.
 ```{r}
-refseq_XPlength(xp)
+refseq_XPlength(xp, retries = 3)
 refseq_mol_wt(xp)
 ```
 
@@ -168,4 +169,3 @@ I'd really appreciate your feedback. The whole code used in this tutorial is ava
 &nbsp;
 
 Córdoba, (Spain), `r Sys.Date()`.
-
Original file line number	Diff line number	Diff line change
Expand Up		@@ -39,3 +39,4 @@ refseq_AAseq <- function(xp) {
		my_aa

		}
-Original file line number
+Diff line change
@@ Expand Up @@
         }
       })
       return(ncbi)
-      }
+    }