Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
jdieramon committed May 16, 2024
2 parents f0baff6 + c2cabf0 commit 2ba8d39
Show file tree
Hide file tree
Showing 11 changed files with 89 additions and 53 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: refseqR
Type: Package
Title: Common Computational Operations Working with RefSeq Entries (GenBank)
Version: 1.0.1
Version: 1.0.2
Authors@R:
c(person("Jose V.", "Die", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "0000-0002-7506-8590")),
Expand Down
1 change: 1 addition & 0 deletions R/refseq_AAseq.R
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,4 @@ refseq_AAseq <- function(xp) {
my_aa

}

6 changes: 3 additions & 3 deletions R/refseq_CDSseq.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ refseq_CDSseq <- function(xm) {
res <- refseq_CDScoords(xm)
my_cds <- sapply(seq(res), function(i) {
cds <- rentrez::entrez_fetch(db = "nuccore", id = names(res)[i],
rettype = "fasta",
seq_start = start(res)[i],
seq_stop = end(res)[i])
rettype = "fasta",
seq_start = start(res)[i],
seq_stop = end(res)[i])
cds_tidy <- strsplit(cds, "\n")
cds_tidy <- as.character(paste0(cds_tidy[[1]][2:length(cds_tidy[[1]])], collapse = ""))

Expand Down
26 changes: 20 additions & 6 deletions R/refseq_XPlength.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
#' @description \code{refseq_XPlength()} Returns the amino acid length from a single XP accession.
#'
#' @usage
#' refseq_XPlength(xp)
#' refseq_XPlength(xp, retries)
#'
#' @param xp A character string of the XP id.
#' @param retries A numeric value to control the number of retry attempts to handle internet errors.
#'
#'
#' @returns A numeric value representing the aa length of the protein specified as `xp`.
#'
Expand All @@ -14,16 +16,28 @@
#' @examples
#' # Get the amino acid length from a set of XP accessions
#' xp = c("XP_004487758", "XP_004488550")
#' sapply(xp, function(x) refseq_XPlength(x), USE.NAMES = FALSE)
#' sapply(xp, function(x) refseq_XPlength(x, retries = 3), USE.NAMES = FALSE)
#'
#' @author Jose V. Die
#'
#' @export

refseq_XPlength <- function(xp) {
# Get the item list for that protein id
xpinfo <- rentrez::entrez_summary(db = "protein", id = xp)
refseq_XPlength <- function(xp, retries = 3) {
  # Return the `slen` field (amino acid length) of the protein summary
  # fetched for `xp`.
  #
  # xp:      XP accession, passed as `id` to rentrez::entrez_summary().
  # retries: number of further attempts made when the request fails with an
  #          "HTTP error: 502" message; any other error is re-raised as is.
  #
  # The value of tryCatch() is the value of the function, so the length
  # obtained by a successful retry is handed back to the caller instead of
  # being discarded (which previously left `xpinfo` undefined after a retry).
  tryCatch({
    # Get the item list for that protein id
    xpinfo <- rentrez::entrez_summary(db = "protein", id = xp)
    # Get the protein length
    xpinfo$slen
  }, error = function(e) {
    if (grepl("HTTP error: 502", conditionMessage(e)) && retries > 0) {
      message("Retrying...\n")
      Sys.sleep(5) # Wait for 5 seconds before retrying
      refseq_XPlength(xp, retries - 1)
    } else {
      # Out of retries, or not a 502: propagate the original condition
      stop(e)
    }
  })
}

2 changes: 1 addition & 1 deletion R/refseq_fromGene.R
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,4 @@ refseq_fromGene <- function(gene_symbol , sequence = "XM", retries = 3) {
}
})
return(ncbi)
}
}
4 changes: 2 additions & 2 deletions R/refseq_fromXM.R
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,8 @@ refseq_fromXM <- function(xm, feat) {

# Build dataframe
df <- data.frame(matrix(unlist(mrna), nrow = length(mrna)/length(feat),
byrow = T),
stringsAsFactors = F)
byrow = T),
stringsAsFactors = F)

colnames(df) = feat

Expand Down
43 changes: 30 additions & 13 deletions R/refseq_geneSymbol.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,11 @@
#' @description \code{refseq_geneSymbol()} Returns the gene symbol from a single XP or XM accession.
#'
#' @usage
#' refseq_geneSymbol (id, db)
#' refseq_geneSymbol (id, db, retries)
#'
#' @param id A character string of the XP or XM id.
#' @param db A character string of the "nuccore" or "protein" database.
#' @param retries A numeric value to control the number of retry attempts to handle internet errors.
#'
#' @returns A character vector containing the gene symbol corresponding to the accession specified as `id`.
#'
Expand All @@ -16,27 +17,43 @@
#' @examples
#' # Get the gene symbol from a set of XM accessions
#' xm = c("XM_004487701", "XM_004488493")
#' sapply(xm, function(x) refseq_geneSymbol (x, db = "nuccore"), USE.NAMES = FALSE)
#' sapply(xm, function(x) refseq_geneSymbol (x, db = "nuccore", retries = 3), USE.NAMES = FALSE)
#'
#' # Get the gene symbol from a set of XP accessions
#' xp = c("XP_004487758")
#' sapply(xp, function(x) refseq_geneSymbol (x, db = "protein"), USE.NAMES = FALSE)
#' sapply(xp, function(x) refseq_geneSymbol (x, db = "protein", retries = 3), USE.NAMES = FALSE)
#'
#' @author Jose V. Die
#'
#' @export

refseq_geneSymbol <- function(id, db = "protein") {
if (db == "protein") {
id_elink = rentrez::entrez_link(dbfrom = "protein", id = id, db= "gene")
gene_id = id_elink$links$protein_gene
} else {
id_elink = rentrez::entrez_link(dbfrom = "nuccore", id = id, db= "gene")
gene_id = id_elink$links$nuccore_gene
}
refseq_geneSymbol <- function(id, db = "protein", retries = 3) {
  # Return the `name` field (gene symbol) of the Gene record linked to `id`.
  #
  # id:      XP or XM accession, passed as `id` to rentrez::entrez_link().
  # db:      "protein" links from the protein database; any other value
  #          links from "nuccore".
  # retries: number of further attempts made when a request fails with an
  #          "HTTP error: 502" message; any other error is re-raised as is.
  #
  # Both Entrez requests sit inside tryCatch() and its value is the value of
  # the function, so the symbol obtained by a successful retry is handed back
  # to the caller (previously it was discarded and `gene_id` was left
  # undefined after a retry).
  tryCatch({
    if (db == "protein") {
      id_elink <- rentrez::entrez_link(dbfrom = "protein", id = id, db = "gene")
      gene_id <- id_elink$links$protein_gene
    } else {
      id_elink <- rentrez::entrez_link(dbfrom = "nuccore", id = id, db = "gene")
      gene_id <- id_elink$links$nuccore_gene
    }

    gene_summ <- rentrez::entrez_summary(db = "gene", id = gene_id)
    gene_summ$name
  }, error = function(e) {
    if (grepl("HTTP error: 502", conditionMessage(e)) && retries > 0) {
      message("Retrying...\n")
      Sys.sleep(5) # Wait for 5 seconds before retrying
      refseq_geneSymbol(id, db, retries - 1)
    } else {
      # Out of retries, or not a 502: propagate the original condition
      stop(e)
    }
  })
}

38 changes: 19 additions & 19 deletions R/refseq_mol_wt.R
Original file line number Diff line number Diff line change
Expand Up @@ -38,27 +38,27 @@ refseq_mol_wt <- function(xp) {

mol_wt <- 0 # keep track of success

listName <- strsplit(listName, "\n")
listName <- strsplit(listName, "\n")

for(i in seq(listName[[1]])) {
val <- listName[[1]][i]
#remove whitespaces from the string
val <- gsub(" ", "", val)
#remove "/" symbol from the string
val <- gsub("/", "", val)
# split the string from "="
val <- strsplit(val, "=")
for(i in seq(listName[[1]])) {
val <- listName[[1]][i]
#remove whitespaces from the string
val <- gsub(" ", "", val)
#remove "/" symbol from the string
val <- gsub("/", "", val)
# split the string from "="
val <- strsplit(val, "=")

if(feat %in% val[[1]][1]) {
# 2nd element of the list contains the mol.wt
return(as.numeric(val[[1]][2]))
mol_wt <- mol_wt+1
}
if(feat %in% val[[1]][1]) {
# 2nd element of the list contains the mol.wt
return(as.numeric(val[[1]][2]))
mol_wt <- mol_wt+1
}
# Defensive Programming
# if the loop reaches the last entry of the list and couldn't find the 'feat', return 0
if(i == length(listName[[1]]) & mol_wt == 0) {
return(0)
}
}
# Defensive Programming
# if the loop reaches the last entry of the list and couldn't find the 'feat', return 0
if(i == length(listName[[1]]) & mol_wt == 0) {
return(0)
}
}

6 changes: 4 additions & 2 deletions man/refseq_XPlength.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 5 additions & 3 deletions man/refseq_geneSymbol.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions vignettes/refseqR.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ knitr::opts_chunk$set(
comment = "#>"
)
```

Load the library.
```{r setup}
library(refseqR)
Expand Down Expand Up @@ -68,7 +69,7 @@ strsplit(mrna_gb, "\n")[[1]][1:30]

The `refseq_fromXM` function serves as a wrapper built on top of 'entrez_summary' from the 'rentrez' package, designed to extract specific features from the obtained data. Typically, my focus lies on key features like id, accession, title, update, or sequence length (bp). However, you have the flexibility to tailor the function to extract additional features of interest from the `esummary_list` object.

```{r}
```{r, eval = F}
xm = c("XM_004487701", "XM_004488493", "XM_004501904")
feat = c("caption", "moltype", "sourcedb", "slen", "title")
refseq_fromXM(xm ,feat)
Expand Down Expand Up @@ -114,7 +115,7 @@ refseq_XMfromXP(xp)

Two specific functions prove useful for managing protein accessions: `refseq_XPlength` offers the amino acid length of the sequence, while `refseq_mol_wt` provides the molecular weight in Daltons.
```{r}
refseq_XPlength(xp)
refseq_XPlength(xp, retries = 3)
refseq_mol_wt(xp)
```

Expand Down Expand Up @@ -168,4 +169,3 @@ I'd really appreciate your feedback. The whole code used in this tutorial is ava
&nbsp;

Córdoba, (Spain), `r Sys.Date()`.

0 comments on commit 2ba8d39

Please sign in to comment.