Merge pull request #6 from stephenturner/dev

support medrxiv
stephenturner · Sep 25, 2024 · 20dcdd9 · 20dcdd9
2 parents 64d927a + 75534f6
commit 20dcdd9
Show file tree

Hide file tree

Showing 21 changed files with 320 additions and 171 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,10 +1,10 @@
 Package: biorecap
-Title: Retrieve and summarize bioRxiv preprints with a local LLM using ollama
-Version: 0.1.1
+Title: Retrieve and summarize bioRxiv and medRxiv preprints with a local LLM using ollama
+Version: 0.2.0
 Authors@R: 
     person("Stephen", "Turner", , "[email protected]", role = c("aut", "cre"),
            comment = c(ORCID = "0000-0001-9140-9028"))
-Description: Retrieve and summarize bioRxiv preprints with a local LLM using ollama.
+Description: Retrieve and summarize bioRxiv and medRxiv preprints with a local LLM using ollama.
 License: MIT + file LICENSE
 URL: https://stephenturner.github.io/biorecap/
 BugReports: https://github.com/stephenturner/biorecap/issues

diff --git a/NEWS.md b/NEWS.md
@@ -1,3 +1,10 @@
+# biorecap 0.2.0
+
+- Added medRxiv support. The `get_preprints()` function will now pull from either the bioRxiv or medRxiv RSS feed depending on the subject passed to it. All downstream functions and reporting updated to reflect this change (fixes #5).
+- Changed default model to llama 3.2 3B.
+- Added new source column for the returned preprints indicating whether the preprint came from bioRxiv or medRxiv.
+- Updated tests.
+
 # biorecap 0.1.1
 
 - Fix bug in `add_summary()` caused by upstream changes in ollamar (fixes #1).

diff --git a/R/biorecap.R b/R/biorecap.R
@@ -44,7 +44,7 @@ build_prompt_subject <- function(subject,
                                  title,
                                  summary,
                                  nsentences=5L,
-                                 instructions=c("I am giving you information about preprints published in bioRxiv recently.",
+                                 instructions=c("I am giving you information about recent bioRxiv/medRxiv preprints.",
                                                 "I'll give you the subject, preprint titles, and short summary of each paper.",
                                                 "Please provide a general summary new advances in this subject/field in general.",
                                                 "Provide this summary of the field in as many sentences as I instruct.",
@@ -57,31 +57,52 @@ build_prompt_subject <- function(subject,
   return(prompt)
 }
 
-#' Get bioRxiv preprints
+#' Get bioRxiv/medRxiv preprints
 #'
-#' @param subject A character vector of valid biorxiv subjects. See [subjects].
-#' @param baseurl The base URL for the biorxiv RSS feed. Default is `https://connect.biorxiv.org/biorxiv_xml.php?subject=`. Do not change unless you know what you are doing.
+#' @param subject A character vector of valid bioRxiv and/or medRxiv subjects. See [subjects].
 #' @param clean Logical; try to strip out graphical abstract information? If TRUE, this strips away any text between `O_FIG` and `C_FIG`, and the words `graphical abstract` from the abstract text in the RSS feed.
 #'
-#' @return A data frame of bioRxiv preprints.
+#' @return A data frame of preprints from bioRxiv and/or medRxiv.
 #' @export
 #'
 #' @examples
-#' preprints <- get_preprints(subject=c("bioinformatics", "genomics"))
+#' preprints <- get_preprints(subject=c("bioinformatics", "Public_and_Global_Health"))
 #' preprints
 #'
-get_preprints <- function(subject="all", baseurl="https://connect.biorxiv.org/biorxiv_xml.php?subject=", clean=TRUE) {
+get_preprints <- function(subject="all", clean=TRUE) {
 
+  subject <- tolower(subject)
   stopifnot(is.character(subject))
-  if (any(!subject %in% biorecap::subjects)) stop("Invalid subject. See ?subjects for valid choices.")
+  if (any(!subject %in% as.vector(unlist(biorecap::subjects)))) stop("Invalid subject. See ?subjects for valid choices.")
 
-  preprints <-
-    lapply(subject, \(x) suppressMessages(preprints <- tidyRSS::tidyfeed(paste0(baseurl, x)))) |>
-    stats::setNames(subject) |>
-    dplyr::bind_rows(.id="subject") |>
-    dplyr::select("subject", title="item_title", url="item_link", abstract="item_description") |>
-    dplyr::mutate(dplyr::across(dplyr::everything(), trimws))
-  if (nrow(preprints)<1L) stop("Something went wrong. No papers found for subject ", subject) #nocov
+  preprints <- list()
+
+  subject_bio <- subject[subject %in% biorecap::subjects$biorxiv]
+  if (length(subject_bio)>0) {
+    preprints$bio <-
+      lapply(subject_bio, \(x) suppressMessages(preprints <- tidyRSS::tidyfeed(paste0("https://connect.biorxiv.org/biorxiv_xml.php?subject=", x)))) |>
+      stats::setNames(subject_bio) |>
+      dplyr::bind_rows(.id="subject") |>
+      dplyr::select("subject", title="item_title", url="item_link", abstract="item_description") |>
+      dplyr::mutate(dplyr::across(dplyr::everything(), trimws)) |>
+      dplyr::mutate("source"="bioRxiv", .before=1)
+    if (nrow(preprints$bio)<1L) stop("Something went wrong. No papers found for subject ", subject) #nocov
+  }
+
+
+  subject_med <- subject[subject %in% biorecap::subjects$medrxiv]
+  if (length(subject_med)>0) {
+    preprints$med <-
+      lapply(subject_med, \(x) suppressMessages(preprints <- tidyRSS::tidyfeed(paste0("https://connect.medrxiv.org/medrxiv_xml.php?subject=", x)))) |>
+      stats::setNames(subject_med) |>
+      dplyr::bind_rows(.id="subject") |>
+      dplyr::select("subject", title="item_title", url="item_link", abstract="item_description") |>
+      dplyr::mutate(dplyr::across(dplyr::everything(), trimws)) |>
+      dplyr::mutate("source"="medRxiv", .before=1)
+    if (nrow(preprints$med)<1L) stop("Something went wrong. No papers found for subject ", subject) #nocov
+  }
+
+  preprints <- dplyr::bind_rows(preprints)
 
   if (clean) {
     preprints <-
@@ -101,7 +122,7 @@ get_preprints <- function(subject="all", baseurl="https://connect.biorxiv.org/bi
 #'
 #' @seealso [build_prompt_preprint()]
 #'
-#' @return A data frame of bioRxiv preprints with a prompt added.
+#' @return A data frame of preprints with a prompt added.
 #' @export
 #'
 #' @examples
@@ -144,7 +165,7 @@ add_prompt <- function(preprints, ...) {
 #' preprints
 #' }
 #'
-add_summary <- function(preprints, model="llama3.1") {
+add_summary <- function(preprints, model="llama3.2") {
 
   if (!inherits(preprints, "preprints_prompt")) warning("Expecting a tibble of class 'preprints_prompt' returned from get_preprints() |> add_prompt().")
   if (!inherits(preprints, "data.frame")) stop("Expecting a data frame.")
@@ -222,13 +243,13 @@ tt_preprints <- function(preprints, cols=c("title", "summary"), width=c(1,3)) {
 }
 
 
-#' Create a report from bioRxiv preprints
+#' Create a report from bioRxiv/medRxiv preprints
 #'
 #' @param output_dir Directory to save the report.
 #' @param subject Character vector of subjects to include in the report.
 #' @param nsentences Number of sentences to summarize each paper in.
 #' @param model The model to use for generating summaries. See [ollamar::list_models()].
-#' @param use_example_preprints Use the example preprints data included with the package instead of fetching new data from bioRxiv. For diagnostic/testing purposes only.
+#' @param use_example_preprints Use the example preprints data included with the package instead of fetching new data from bioRxiv/medRxiv. For diagnostic/testing purposes only.
 #' @param ... Other arguments passed to [rmarkdown::render()].
 #'
 #' @return Nothing; called for its side effects to produce a report.

diff --git a/R/data.R b/R/data.R
@@ -4,7 +4,7 @@
 #'
 #' @references <https://www.biorxiv.org/alertsrss>
 #'
-#' @format A character vector
+#' @format A list of character vectors of subjects, one for bioRxiv, one for medRxiv.
 #'
 #' @examples
 #' subjects

diff --git a/README.Rmd b/README.Rmd
@@ -28,7 +28,7 @@ knitr::opts_chunk$set(
 [![biorecap-r-universe](https://stephenturner.r-universe.dev/badges/biorecap)](https://stephenturner.r-universe.dev/biorecap)
 <!-- badges: end -->
 
-Retrieve and summarize [bioRxiv](https://www.biorxiv.org/) preprints using a local LLM with [Ollama](https://ollama.com/) via [ollamar](https://cran.r-project.org/package=ollamar). 
+Retrieve and summarize [bioRxiv](https://www.biorxiv.org/) and [medRxiv](https://www.medrxiv.org/) preprints using a local LLM with [Ollama](https://ollama.com/) via [ollamar](https://cran.r-project.org/package=ollamar). 
 
 Turner, S. D. (2024). biorecap: an R package for summarizing bioRxiv preprints with a local LLM. _arXiv_, 2408.11707. https://doi.org/10.48550/arXiv.2408.11707. 
 
@@ -73,43 +73,44 @@ list_models()
 ```
 
 ```
-#> # A tibble: 3 × 4
-#>   name            model           parameter_size quantization_level
-#>   <chr>           <chr>           <chr>          <chr>             
-#> 1 gemma2:latest   gemma2:latest   9.2B           Q4_0              
-#> 2 llama3.1:latest llama3.1:latest 8.0B           Q4_0              
-#> 3 llama3.1:70b    llama3.1:70b    70.6B          Q4_0           
+             name   size parameter_size quantization_level            modified
+1   gemma2:latest 5.4 GB           9.2B               Q4_0 2024-08-07T07:35:15
+3    llama3.1:70b  40 GB          70.6B               Q4_0 2024-07-24T10:57:08
+4 llama3.1:latest 4.7 GB           8.0B               Q4_0 2024-07-31T09:38:38
+5 llama3.2:latest   2 GB           3.2B             Q4_K_M 2024-09-25T14:54:23
+6     phi3:latest 2.2 GB           3.8B               Q4_0 2024-08-28T04:37:58      
 ```
 
-Write an HTML report containing summaries of recent preprints in select subject areas to the current working directory.
+Write an HTML report containing summaries of recent preprints in select subject areas to the current working directory. You can include both bioRxiv and medRxiv subjects, and biorecap will know which RSS feed to use.
 
 ```{r, eval=FALSE}
 biorecap_report(output_dir=".", 
-                subject=c("bioinformatics", "genomics", "synthetic_biology"), 
+                subject=c("bioinformatics", "infectious_diseases"), 
                 model="llama3.1")
 ```
 
-Example HTML report generated from the bioRxiv RSS feed on August 6, 2024:
+Example HTML report generated from the bioinformatics (bioRxiv) and infectious diseases (medRxiv) subjects on September 25, 2024:
 
 ```{r, echo=FALSE}
-knitr::include_graphics(here::here("man/figures/report_screenshot.png"))
+knitr::include_graphics(here::here("man/figures/report_screenshot.jpg"))
 ```
 
 
 ### Details
 
-The `get_preprints()` function retrieves preprints from bioRxiv's RSS feeds. You pass one or more subjects to the `subject` argument. 
+The `get_preprints()` function retrieves preprints from the bioRxiv or medRxiv RSS feed, depending on which server each subject belongs to. You pass one or more subjects to the `subject` argument.
 
 ```{r, eval=FALSE}
 pp <- get_preprints(subject=c("bioinformatics", 
-                              "genomics", 
-                              "synthetic_biology"))
-pp
+                              "infectious_diseases"))
+head(pp)
+tail(pp)
 ```
 
 ```{r, echo=FALSE}
 pp <- example_preprints
-pp |> dplyr::select(-prompt, -summary)
+pp |> dplyr::select(-prompt, -summary) |> head()
+pp |> dplyr::select(-prompt, -summary) |> tail()
 ```
 
 The `add_prompt()` function adds a prompt to each preprint that will be used to prompt the model.
@@ -137,9 +138,9 @@ The `add_summary()` function uses a locally running LLM available through Ollama
 
 ```{r, eval=FALSE}
 pp <- 
-  get_preprints(subject=c("bioinformatics", "genomics", "synthetic_biology")) |> 
+  get_preprints(subject=c("bioinformatics", "infectious_diseases")) |> 
   add_prompt() |> 
-  add_summary(model="llama3.1")
+  add_summary(model="llama3.2")
 ```
 
 Let's take a look at the results:
@@ -156,20 +157,21 @@ The `biorecap_report()` function runs this code in an RMarkdown template, writin
 
 ```{r, eval=FALSE}
 biorecap_report(output_dir=".", 
-                subject=c("bioinformatics", "genomics", "synthetic_biology"), 
-                model="llama3.1")
+                subject=c("bioinformatics", "infectious_diseases"), 
+                model="llama3.2")
 ```
 
-The built-in `subjects` vector contains all the available bioRxiv subjects.
+The built-in `subjects` object is a list of two character vectors containing all the available bioRxiv and medRxiv subjects.
 
 ```{r}
-subjects
+subjects$biorxiv
+subjects$medrxiv
 ```
 
 You could create a report for _all_ subjects like this (note, this could take some time):
 
 ```{r, eval=FALSE}
 biorecap_report(output_dir=".", 
                 subject=subjects, 
-                model="llama3.1")
+                model="llama3.2")
 ```