diff --git a/DESCRIPTION b/DESCRIPTION index 92d8360..68fc780 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,10 +1,10 @@ Package: biorecap -Title: Retrieve and summarize bioRxiv preprints with a local LLM using ollama -Version: 0.1.1 +Title: Retrieve and summarize bioRxiv and medRxiv preprints with a local LLM using ollama +Version: 0.2.0 Authors@R: person("Stephen", "Turner", , "vustephen@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0001-9140-9028")) -Description: Retrieve and summarize bioRxiv preprints with a local LLM using ollama. +Description: Retrieve and summarize bioRxiv and medRxiv preprints with a local LLM using ollama. License: MIT + file LICENSE URL: https://stephenturner.github.io/biorecap/ BugReports: https://github.com/stephenturner/biorecap/issues diff --git a/NEWS.md b/NEWS.md index 2aa129e..60c3711 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# biorecap 0.2.0 + +- Added medRxiv support. The `get_preprints()` function will now pull from either the bioRxiv or medRxiv RSS feed depending on the subject passed to it. All downstream functions and reporting updated to reflect this change (fixes #5). +- Changed default model to llama 3.2 3B. +- Added new source column for the returned preprints indicating whether the preprint came from bioRxiv or medRxiv. +- Updated tests. + # biorecap 0.1.1 - Fix bug in `add_summary()` caused by upstream changes in ollamar (fixes #1). 
diff --git a/R/biorecap.R b/R/biorecap.R index cd0c827..2c9e908 100644 --- a/R/biorecap.R +++ b/R/biorecap.R @@ -44,7 +44,7 @@ build_prompt_subject <- function(subject, title, summary, nsentences=5L, - instructions=c("I am giving you information about preprints published in bioRxiv recently.", + instructions=c("I am giving you information about recent bioRxiv/medRxiv preprints.", "I'll give you the subject, preprint titles, and short summary of each paper.", "Please provide a general summary new advances in this subject/field in general.", "Provide this summary of the field in as many sentences as I instruct.", @@ -57,31 +57,52 @@ build_prompt_subject <- function(subject, return(prompt) } -#' Get bioRxiv preprints +#' Get bioRxiv/medRxiv preprints #' -#' @param subject A character vector of valid biorxiv subjects. See [subjects]. -#' @param baseurl The base URL for the biorxiv RSS feed. Default is `https://connect.biorxiv.org/biorxiv_xml.php?subject=`. Do not change unless you know what you are doing. +#' @param subject A character vector of valid bioRxiv and/or medRxiv subjects. See [subjects]. #' @param clean Logical; try to strip out graphical abstract information? If TRUE, this strips away any text between `O_FIG` and `C_FIG`, and the words `graphical abstract` from the abstract text in the RSS feed. #' -#' @return A data frame of bioRxiv preprints. +#' @return A data frame of preprints from bioRxiv and/or medRxiv. #' @export #' #' @examples -#' preprints <- get_preprints(subject=c("bioinformatics", "genomics")) +#' preprints <- get_preprints(subject=c("bioinformatics", "Public_and_Global_Health")) #' preprints #' -get_preprints <- function(subject="all", baseurl="https://connect.biorxiv.org/biorxiv_xml.php?subject=", clean=TRUE) { +get_preprints <- function(subject="all", clean=TRUE) { + subject <- tolower(subject) stopifnot(is.character(subject)) - if (any(!subject %in% biorecap::subjects)) stop("Invalid subject. 
See ?subjects for valid choices.") + if (any(!subject %in% as.vector(unlist(biorecap::subjects)))) stop("Invalid subject. See ?subjects for valid choices.") - preprints <- - lapply(subject, \(x) suppressMessages(preprints <- tidyRSS::tidyfeed(paste0(baseurl, x)))) |> - stats::setNames(subject) |> - dplyr::bind_rows(.id="subject") |> - dplyr::select("subject", title="item_title", url="item_link", abstract="item_description") |> - dplyr::mutate(dplyr::across(dplyr::everything(), trimws)) - if (nrow(preprints)<1L) stop("Something went wrong. No papers found for subject ", subject) #nocov + preprints <- list() + + subject_bio <- subject[subject %in% biorecap::subjects$biorxiv] + if (length(subject_bio)>0) { + preprints$bio <- + lapply(subject_bio, \(x) suppressMessages(preprints <- tidyRSS::tidyfeed(paste0("https://connect.biorxiv.org/biorxiv_xml.php?subject=", x)))) |> + stats::setNames(subject_bio) |> + dplyr::bind_rows(.id="subject") |> + dplyr::select("subject", title="item_title", url="item_link", abstract="item_description") |> + dplyr::mutate(dplyr::across(dplyr::everything(), trimws)) |> + dplyr::mutate("source"="bioRxiv", .before=1) + if (nrow(preprints$bio)<1L) stop("Something went wrong. No papers found for subject ", subject) #nocov + } + + + subject_med <- subject[subject %in% biorecap::subjects$medrxiv] + if (length(subject_med)>0) { + preprints$med <- + lapply(subject_med, \(x) suppressMessages(preprints <- tidyRSS::tidyfeed(paste0("https://connect.medrxiv.org/medrxiv_xml.php?subject=", x)))) |> + stats::setNames(subject_med) |> + dplyr::bind_rows(.id="subject") |> + dplyr::select("subject", title="item_title", url="item_link", abstract="item_description") |> + dplyr::mutate(dplyr::across(dplyr::everything(), trimws)) |> + dplyr::mutate("source"="medRxiv", .before=1) + if (nrow(preprints$med)<1L) stop("Something went wrong. 
No papers found for subject ", subject) #nocov + } + + preprints <- dplyr::bind_rows(preprints) if (clean) { preprints <- @@ -101,7 +122,7 @@ get_preprints <- function(subject="all", baseurl="https://connect.biorxiv.org/bi #' #' @seealso [build_prompt_preprint()] #' -#' @return A data frame of bioRxiv preprints with a prompt added. +#' @return A data frame of preprints with a prompt added. #' @export #' #' @examples @@ -144,7 +165,7 @@ add_prompt <- function(preprints, ...) { #' preprints #' } #' -add_summary <- function(preprints, model="llama3.1") { +add_summary <- function(preprints, model="llama3.2") { if (!inherits(preprints, "preprints_prompt")) warning("Expecting a tibble of class 'preprints_prompt' returned from get_preprints() |> add_prompt().") if (!inherits(preprints, "data.frame")) stop("Expecting a data frame.") @@ -222,13 +243,13 @@ tt_preprints <- function(preprints, cols=c("title", "summary"), width=c(1,3)) { } -#' Create a report from bioRxiv preprints +#' Create a report from bioRxiv/medRxiv preprints #' #' @param output_dir Directory to save the report. #' @param subject Character vector of subjects to include in the report. #' @param nsentences Number of sentences to summarize each paper in. #' @param model The model to use for generating summaries. See [ollamar::list_models()]. -#' @param use_example_preprints Use the example preprints data included with the package instead of fetching new data from bioRxiv. For diagnostic/testing purposes only. +#' @param use_example_preprints Use the example preprints data included with the package instead of fetching new data from bioRxiv/medRxiv. For diagnostic/testing purposes only. #' @param ... Other arguments passed to [rmarkdown::render()]. #' #' @return Nothing; called for its side effects to produce a report. 
diff --git a/R/data.R b/R/data.R index 9967548..389d838 100644 --- a/R/data.R +++ b/R/data.R @@ -4,7 +4,7 @@ #' #' @references #' -#' @format A character vector +#' @format A list of character vectors of subjects, one for bioRxiv, one for medRxiv. #' #' @examples #' subjects diff --git a/README.Rmd b/README.Rmd index f238f83..8a16e04 100644 --- a/README.Rmd +++ b/README.Rmd @@ -28,7 +28,7 @@ knitr::opts_chunk$set( [![biorecap-r-universe](https://stephenturner.r-universe.dev/badges/biorecap)](https://stephenturner.r-universe.dev/biorecap) -Retrieve and summarize [bioRxiv](https://www.biorxiv.org/) preprints using a local LLM with [Ollama](https://ollama.com/) via [ollamar](https://cran.r-project.org/package=ollamar). +Retrieve and summarize [bioRxiv](https://www.biorxiv.org/) and [medRxiv](https://www.medrxiv.org/) preprints using a local LLM with [Ollama](https://ollama.com/) via [ollamar](https://cran.r-project.org/package=ollamar). Turner, S. D. (2024). biorecap: an R package for summarizing bioRxiv preprints with a local LLM. _arXiv_, 2408.11707. https://doi.org/10.48550/arXiv.2408.11707. @@ -73,43 +73,44 @@ list_models() ``` ``` -#> # A tibble: 3 × 4 -#> name model parameter_size quantization_level -#> -#> 1 gemma2:latest gemma2:latest 9.2B Q4_0 -#> 2 llama3.1:latest llama3.1:latest 8.0B Q4_0 -#> 3 llama3.1:70b llama3.1:70b 70.6B Q4_0 + name size parameter_size quantization_level modified +1 gemma2:latest 5.4 GB 9.2B Q4_0 2024-08-07T07:35:15 +3 llama3.1:70b 40 GB 70.6B Q4_0 2024-07-24T10:57:08 +4 llama3.1:latest 4.7 GB 8.0B Q4_0 2024-07-31T09:38:38 +5 llama3.2:latest 2 GB 3.2B Q4_K_M 2024-09-25T14:54:23 +6 phi3:latest 2.2 GB 3.8B Q4_0 2024-08-28T04:37:58 ``` -Write an HTML report containing summaries of recent preprints in select subject areas to the current working directory. +Write an HTML report containing summaries of recent preprints in select subject areas to the current working directory. 
You can include both bioRxiv and medRxiv subjects, and biorecap will know which RSS feed to use. ```{r, eval=FALSE} biorecap_report(output_dir=".", - subject=c("bioinformatics", "genomics", "synthetic_biology"), + subject=c("bioinformatics", "infectious_diseases"), model="llama3.1") ``` -Example HTML report generated from the bioRxiv RSS feed on August 6, 2024: +Example HTML report generated from the bioinformatics (bioRxiv) and infectious diseases (medRxiv) subjects on September 25, 2024: ```{r, echo=FALSE} -knitr::include_graphics(here::here("man/figures/report_screenshot.png")) +knitr::include_graphics(here::here("man/figures/report_screenshot.jpg")) ``` ### Details -The `get_preprints()` function retrieves preprints from bioRxiv's RSS feeds. You pass one or more subjects to the `subject` argument. +The `get_preprints()` function retrieves preprints from the RSS feed of either bioRxiv or medRxiv, based on the subject you provided. You pass one or more subjects to the `subject` argument. ```{r, eval=FALSE} pp <- get_preprints(subject=c("bioinformatics", - "genomics", - "synthetic_biology")) -pp + "infectious_diseases")) +head(pp) +tail(pp) ``` ```{r, echo=FALSE} pp <- example_preprints -pp |> dplyr::select(-prompt, -summary) +pp |> dplyr::select(-prompt, -summary) |> head() +pp |> dplyr::select(-prompt, -summary) |> tail() ``` The `add_prompt()` function adds a prompt to each preprint that will be used to prompt the model. 
@@ -137,9 +138,9 @@ The `add_summary()` function uses a locally running LLM available through Ollama ```{r, eval=FALSE} pp <- - get_preprints(subject=c("bioinformatics", "genomics", "synthetic_biology")) |> + get_preprints(subject=c("bioinformatics", "infectious_diseases")) |> add_prompt() |> - add_summary(model="llama3.1") + add_summary(model="llama3.2") ``` Let's take a look at the results: @@ -156,14 +157,15 @@ The `biorecap_report()` function runs this code in an RMarkdown template, writin ```{r, eval=FALSE} biorecap_report(output_dir=".", - subject=c("bioinformatics", "genomics", "synthetic_biology"), - model="llama3.1") + subject=c("bioinformatics", "infectious_diseases"), + model="llama3.2") ``` -The built-in `subjects` vector contains all the available bioRxiv subjects. +The built-in `subjects` object is a list of character vectors containing all the available bioRxiv and medRxiv subjects. ```{r} -subjects +subjects$biorxiv +subjects$medrxiv ``` You could create a report for _all_ subjects like this (note, this could take some time): @@ -171,5 +173,5 @@ You could create a report for _all_ subjects like this (note, this could take so ```{r, eval=FALSE} biorecap_report(output_dir=".", subject=subjects, - model="llama3.1") + model="llama3.2") ``` diff --git a/README.md b/README.md index 073c385..7fb1d02 100644 --- a/README.md +++ b/README.md @@ -10,8 +10,9 @@ [![biorecap-r-universe](https://stephenturner.r-universe.dev/badges/biorecap)](https://stephenturner.r-universe.dev/biorecap) -Retrieve and summarize [bioRxiv](https://www.biorxiv.org/) preprints -using a local LLM with [Ollama](https://ollama.com/) via +Retrieve and summarize [bioRxiv](https://www.biorxiv.org/) and +[medRxiv](https://www.medrxiv.org/) preprints using a local LLM with +[Ollama](https://ollama.com/) via [ollamar](https://cran.r-project.org/package=ollamar). Turner, S. D. (2024). 
biorecap: an R package for summarizing bioRxiv @@ -57,53 +58,60 @@ Next we can list our available models: list_models() ``` - #> # A tibble: 3 × 4 - #> name model parameter_size quantization_level - #> - #> 1 gemma2:latest gemma2:latest 9.2B Q4_0 - #> 2 llama3.1:latest llama3.1:latest 8.0B Q4_0 - #> 3 llama3.1:70b llama3.1:70b 70.6B Q4_0 + name size parameter_size quantization_level modified + 1 gemma2:latest 5.4 GB 9.2B Q4_0 2024-08-07T07:35:15 + 3 llama3.1:70b 40 GB 70.6B Q4_0 2024-07-24T10:57:08 + 4 llama3.1:latest 4.7 GB 8.0B Q4_0 2024-07-31T09:38:38 + 5 llama3.2:latest 2 GB 3.2B Q4_K_M 2024-09-25T14:54:23 + 6 phi3:latest 2.2 GB 3.8B Q4_0 2024-08-28T04:37:58 Write an HTML report containing summaries of recent preprints in select -subject areas to the current working directory. +subject areas to the current working directory. You can include both +bioRxiv and medRxiv subjects, and biorecap will know which RSS feed to +use. ``` r biorecap_report(output_dir=".", - subject=c("bioinformatics", "genomics", "synthetic_biology"), + subject=c("bioinformatics", "infectious_diseases"), model="llama3.1") ``` -Example HTML report generated from the bioRxiv RSS feed on August 6, -2024: +Example HTML report generated from the bioinformatics (bioRxiv) and +infectious diseases (medRxiv) subjects on September 25, 2024: - + ### Details -The `get_preprints()` function retrieves preprints from bioRxiv’s RSS -feeds. You pass one or more subjects to the `subject` argument. +The `get_preprints()` function retrieves preprints from the RSS feed of +either bioRxiv or medRxiv, based on the subject you provided. You pass +one or more subjects to the `subject` argument. 
``` r pp <- get_preprints(subject=c("bioinformatics", - "genomics", - "synthetic_biology")) -pp + "infectious_diseases")) +head(pp) +tail(pp) ``` - #> # A tibble: 90 × 4 - #> subject title url abstract - #> - #> 1 bioinformatics Integrity and miss grouping as support for clu… http… "The hi… - #> 2 bioinformatics Sainsc: a computational tool for segmentation-… http… "Spatia… - #> 3 bioinformatics BRACE: A novel Bayesian-based imputation appro… http… "Bayesi… - #> 4 bioinformatics Topological embedding and directional feature … http… "Cancer… - #> 5 bioinformatics SeuratExtend: Streamlining Single-Cell RNA-Seq… http… "Single… - #> 6 bioinformatics An Evolutionary Statistics Toolkit for Simplif… http… "We pre… - #> 7 bioinformatics A map of integrated cis-regulatory elements en… http… "Cis-re… - #> 8 bioinformatics MOSTPLAS: A Self-correction Multi-label Learni… http… "Plasmi… - #> 9 bioinformatics Bootstrap Evaluation of Association Matrices (… http… "Motiva… - #> 10 bioinformatics Thermodynamic modeling of Csr/Rsm- RNA interac… http… "Backgr… - #> # ℹ 80 more rows + #> # A tibble: 6 × 5 + #> source subject title url abstract + #> + #> 1 bioRxiv bioinformatics MedGraphNet: Leveraging Multi-Relationa… http… Genetic… + #> 2 bioRxiv bioinformatics High-throughput bacterial aggregation a… http… The com… + #> 3 bioRxiv bioinformatics scParadise: Tunable highly accurate mul… http… scRNA-s… + #> 4 bioRxiv bioinformatics Camera Paths, Modeling, and Image Proce… http… The enh… + #> 5 bioRxiv bioinformatics dScaff - an automatic bioinformatics fr… http… Rapid e… + #> 6 bioRxiv bioinformatics Jaeger: an accurate and fast deep-learn… http… Abstrac… + #> # A tibble: 6 × 5 + #> source subject title url abstract + #> + #> 1 medRxiv infectious_diseases Reactogenicity and immunogenicity … http… "The re… + #> 2 medRxiv infectious_diseases A next generation CRISPR diagnosti… http… "The WH… + #> 3 medRxiv infectious_diseases Hospital-onset bacteraemia and fun… http… "Backgr… + #> 4 
medRxiv infectious_diseases Co-circulating pathogens of humans… http… "Histor… + #> 5 medRxiv infectious_diseases Integration of Group A Streptococc… http… "The Ca… + #> 6 medRxiv infectious_diseases Deep Learning Models for Predictin… http… "The Nu… The `add_prompt()` function adds a prompt to each preprint that will be used to prompt the model. @@ -113,20 +121,20 @@ pp <- pp |> add_prompt() pp ``` - #> # A tibble: 90 × 5 - #> subject title url abstract prompt - #> - #> 1 bioinformatics Integrity and miss grouping as support … http… "The hi… "I am… - #> 2 bioinformatics Sainsc: a computational tool for segmen… http… "Spatia… "I am… - #> 3 bioinformatics BRACE: A novel Bayesian-based imputatio… http… "Bayesi… "I am… - #> 4 bioinformatics Topological embedding and directional f… http… "Cancer… "I am… - #> 5 bioinformatics SeuratExtend: Streamlining Single-Cell … http… "Single… "I am… - #> 6 bioinformatics An Evolutionary Statistics Toolkit for … http… "We pre… "I am… - #> 7 bioinformatics A map of integrated cis-regulatory elem… http… "Cis-re… "I am… - #> 8 bioinformatics MOSTPLAS: A Self-correction Multi-label… http… "Plasmi… "I am… - #> 9 bioinformatics Bootstrap Evaluation of Association Mat… http… "Motiva… "I am… - #> 10 bioinformatics Thermodynamic modeling of Csr/Rsm- RNA … http… "Backgr… "I am… - #> # ℹ 80 more rows + #> # A tibble: 60 × 6 + #> source subject title url abstract prompt + #> + #> 1 bioRxiv bioinformatics MedGraphNet: Leveraging Multi-R… http… Genetic… I am … + #> 2 bioRxiv bioinformatics High-throughput bacterial aggre… http… The com… I am … + #> 3 bioRxiv bioinformatics scParadise: Tunable highly accu… http… scRNA-s… I am … + #> 4 bioRxiv bioinformatics Camera Paths, Modeling, and Ima… http… The enh… I am … + #> 5 bioRxiv bioinformatics dScaff - an automatic bioinform… http… Rapid e… I am … + #> 6 bioRxiv bioinformatics Jaeger: an accurate and fast de… http… Abstrac… I am … + #> 7 bioRxiv bioinformatics AI-Augmented R-Group Exploratio… http… 
Efficie… I am … + #> 8 bioRxiv bioinformatics OPLS-based Multiclass Classific… http… Multicl… I am … + #> 9 bioRxiv bioinformatics STANCE: a unified statistical m… http… A signi… I am … + #> 10 bioRxiv bioinformatics AsaruSim: a single-cell and spa… http… Motivat… I am … + #> # ℹ 50 more rows Let’s take a look at one of these prompts: @@ -170,29 +178,29 @@ that we can do this all in a single pipeline. This takes a few minutes! ``` r pp <- - get_preprints(subject=c("bioinformatics", "genomics", "synthetic_biology")) |> + get_preprints(subject=c("bioinformatics", "infectious_diseases")) |> add_prompt() |> - add_summary(model="llama3.1") + add_summary(model="llama3.2") ``` Let’s take a look at the results: ``` r pp -#> # A tibble: 90 × 6 -#> subject title url abstract prompt summary -#> -#> 1 bioinformatics Integrity and miss grouping as … http… "The hi… "I am… "The p… -#> 2 bioinformatics Sainsc: a computational tool fo… http… "Spatia… "I am… "Sains… -#> 3 bioinformatics BRACE: A novel Bayesian-based i… http… "Bayesi… "I am… "Alter… -#> 4 bioinformatics Topological embedding and direc… http… "Cancer… "I am… "Resea… -#> 5 bioinformatics SeuratExtend: Streamlining Sing… http… "Single… "I am… "Seura… -#> 6 bioinformatics An Evolutionary Statistics Tool… http… "We pre… "I am… "The \… -#> 7 bioinformatics A map of integrated cis-regulat… http… "Cis-re… "I am… "The a… -#> 8 bioinformatics MOSTPLAS: A Self-correction Mul… http… "Plasmi… "I am… "Plasm… -#> 9 bioinformatics Bootstrap Evaluation of Associa… http… "Motiva… "I am… "The a… -#> 10 bioinformatics Thermodynamic modeling of Csr/R… http… "Backgr… "I am… "Resea… -#> # ℹ 80 more rows +#> # A tibble: 60 × 7 +#> source subject title url abstract prompt summary +#> +#> 1 bioRxiv bioinformatics MedGraphNet: Leveraging… http… Genetic… I am … MedGra… +#> 2 bioRxiv bioinformatics High-throughput bacteri… http… The com… I am … The co… +#> 3 bioRxiv bioinformatics scParadise: Tunable hig… http… scRNA-s… I am … scAdam… +#> 4 
bioRxiv bioinformatics Camera Paths, Modeling,… http… The enh… I am … ArtiaX… +#> 5 bioRxiv bioinformatics dScaff - an automatic b… http… Rapid e… I am … dScaff… +#> 6 bioRxiv bioinformatics Jaeger: an accurate and… http… Abstrac… I am … Jaeger… +#> 7 bioRxiv bioinformatics AI-Augmented R-Group Ex… http… Efficie… I am … The pa… +#> 8 bioRxiv bioinformatics OPLS-based Multiclass C… http… Multicl… I am … OPLS-D… +#> 9 bioRxiv bioinformatics STANCE: a unified stati… http… A signi… I am … STANCE… +#> 10 bioRxiv bioinformatics AsaruSim: a single-cell… http… Motivat… I am … AsaruS… +#> # ℹ 50 more rows ``` Let’s look at one of those summaries. Here’s the summary for the @@ -211,15 +219,15 @@ current working directory. ``` r biorecap_report(output_dir=".", - subject=c("bioinformatics", "genomics", "synthetic_biology"), - model="llama3.1") + subject=c("bioinformatics", "infectious_diseases"), + model="llama3.2") ``` -The built-in `subjects` vector contains all the available bioRxiv -subjects. +The built-in `subjects` object is a list of character vectors containing +all the available bioRxiv and medRxiv subjects. 
``` r -subjects +subjects$biorxiv #> [1] "all" #> [2] "animal_behavior_and_cognition" #> [3] "biochemistry" @@ -247,6 +255,61 @@ subjects #> [25] "synthetic_biology" #> [26] "systems_biology" #> [27] "zoology" +subjects$medrxiv +#> [1] "all" +#> [2] "addiction_medicine" +#> [3] "allergy_and_immunology" +#> [4] "anesthesia" +#> [5] "cardiovascular_medicine" +#> [6] "dentistry_and_oral_medicine" +#> [7] "dermatology" +#> [8] "dermatology" +#> [9] "endocrinology" +#> [10] "epidemiology" +#> [11] "ecology" +#> [12] "epidemiology" +#> [13] "forensic_medicine" +#> [14] "gastroenterology" +#> [15] "genetic_and_genomic_medicine" +#> [16] "geriatric_medicine" +#> [17] "health_economics" +#> [18] "health_informatics" +#> [19] "health_policy" +#> [20] "health_systems_and_quality_improvement" +#> [21] "hematology" +#> [22] "hivaids" +#> [23] "infectious_diseases" +#> [24] "intensive_care_and_critical_care_medicine" +#> [25] "medical_education" +#> [26] "medical_ethics" +#> [27] "nephrology" +#> [28] "neurology" +#> [29] "nursing" +#> [30] "nutrition" +#> [31] "obstetrics_and_gynecology" +#> [32] "occupational_and_environmental_health" +#> [33] "oncology" +#> [34] "ophthalmology" +#> [35] "orthopedics" +#> [36] "otolaryngology" +#> [37] "pain_medicine" +#> [38] "palliative_medicine" +#> [39] "pathology" +#> [40] "pediatrics" +#> [41] "pharmacology_and_therapeutics" +#> [42] "primary_care_research" +#> [43] "psychiatry_and_clinical_psychology" +#> [44] "public_and_global_health" +#> [45] "radiology_and_imaging" +#> [46] "rehabilitation_medicine_and_physical_therapy" +#> [47] "respiratory_medicine" +#> [48] "rheumatology" +#> [49] "sexual_and_reproductive_health" +#> [50] "sports_medicine" +#> [51] "surgery" +#> [52] "toxicology" +#> [53] "transplantation" +#> [54] "urology" ``` You could create a report for *all* subjects like this (note, this could @@ -255,5 +318,5 @@ take some time): ``` r biorecap_report(output_dir=".", subject=subjects, - model="llama3.1") + 
model="llama3.2") ``` diff --git a/data-raw/biorecap_data.R b/data-raw/biorecap_data.R index 6fe83c4..d6d673b 100644 --- a/data-raw/biorecap_data.R +++ b/data-raw/biorecap_data.R @@ -1,43 +1,102 @@ # Create vector for all bioRxiv subjects ---------------------------------- +subjects <- list() + # https://www.biorxiv.org/alertsrss +subjects$biorxiv <- c("all", + "animal_behavior_and_cognition", + "biochemistry", + "bioengineering", + "bioinformatics", + "biophysics", + "cancer_biology", + "cell_biology", + "clinical_trials", + "developmental_biology", + "ecology", + "epidemiology", + "evolutionary_biology", + "genetics", + "genomics", + "immunology", + "microbiology", + "molecular_biology", + "neuroscience", + "paleontology", + "pathology", + "pharmacology_and_toxicology", + "plant_biology", + "scientific_communication_and_education", + "synthetic_biology", + "systems_biology", + "zoology") + +# https://www.medrxiv.org/content/alertsrss +subjects$medrxiv <- c("all", + "Addiction_Medicine", + "Allergy_and_Immunology", + "Anesthesia", + "Cardiovascular_Medicine", + "Dentistry_and_Oral_Medicine", + "Dermatology", + "Dermatology", + "endocrinology", + "Epidemiology", + "ecology", + "epidemiology", + "Forensic_Medicine", + "Gastroenterology", + "Genetic_and_Genomic_Medicine", + "Geriatric_Medicine", + "Health_Economics", + "Health_Informatics", + "Health_Policy", + "Health_Systems_and_Quality_Improvement", + "Hematology", + "hivaids", + "infectious_diseases", + "Intensive_Care_and_Critical_Care_Medicine", + "Medical_Education", + "Medical_Ethics", + "Nephrology", + "Neurology", + "Nursing", + "Nutrition", + "Obstetrics_and_Gynecology", + "Occupational_and_Environmental_Health", + "Oncology", + "Ophthalmology", + "Orthopedics", + "Otolaryngology", + "Pain_Medicine", + "Palliative_Medicine", + "Pathology", + "Pediatrics", + "Pharmacology_and_Therapeutics", + "Primary_Care_Research", + "Psychiatry_and_Clinical_Psychology", + "Public_and_Global_Health", + 
"Radiology_and_Imaging", + "Rehabilitation_Medicine_and_Physical_Therapy", + "Respiratory_Medicine", + "Rheumatology", + "Sexual_and_Reproductive_Health", + "Sports_Medicine", + "Surgery", + "Toxicology", + "Transplantation", + "Urology") -subjects <- c("all", - "animal_behavior_and_cognition", - "biochemistry", - "bioengineering", - "bioinformatics", - "biophysics", - "cancer_biology", - "cell_biology", - "clinical_trials", - "developmental_biology", - "ecology", - "epidemiology", - "evolutionary_biology", - "genetics", - "genomics", - "immunology", - "microbiology", - "molecular_biology", - "neuroscience", - "paleontology", - "pathology", - "pharmacology_and_toxicology", - "plant_biology", - "scientific_communication_and_education", - "synthetic_biology", - "systems_biology", - "zoology") +subjects <- lapply(subjects, tolower) usethis::use_data(subjects, overwrite=TRUE) -# Get titles, abstracts, summaries for preprints 2024-08-06 --------------- +# Get titles, abstracts, summaries for preprints 2024-09-25 --------------- library(biorecap) example_preprints <- - get_preprints(subject=c("bioinformatics", "genomics", "synthetic_biology")) |> + get_preprints(subject=c("bioinformatics", "infectious_diseases")) |> add_prompt() |> - add_summary(model="llama3.1:70b") + add_summary(model="llama3.2") usethis::use_data(example_preprints, overwrite=TRUE) readr::write_csv(example_preprints, here::here("inst/extdata/example_preprints.csv.gz")) diff --git a/data/example_preprints.rda b/data/example_preprints.rda index 9373f74..762dc4f 100644 Binary files a/data/example_preprints.rda and b/data/example_preprints.rda differ diff --git a/data/subjects.rda b/data/subjects.rda index 0dea33a..a1dd9d5 100644 Binary files a/data/subjects.rda and b/data/subjects.rda differ diff --git a/inst/extdata/example_preprints.csv.gz b/inst/extdata/example_preprints.csv.gz index 4fc1eb3..1089aec 100644 Binary files a/inst/extdata/example_preprints.csv.gz and b/inst/extdata/example_preprints.csv.gz 
differ diff --git a/inst/rmarkdown/templates/biorecap/skeleton/skeleton.Rmd b/inst/rmarkdown/templates/biorecap/skeleton/skeleton.Rmd index 35bddd8..6ddda5a 100644 --- a/inst/rmarkdown/templates/biorecap/skeleton/skeleton.Rmd +++ b/inst/rmarkdown/templates/biorecap/skeleton/skeleton.Rmd @@ -1,5 +1,5 @@ --- -title: "bioRxiv summary" +title: "biorecap summary" date: "`r format(Sys.Date(), '%B %d, %Y')`" output: html_document: @@ -44,7 +44,8 @@ if (!is.null(output_csv)) { ```{r write-report-content, results='asis'} for (i in unique(pp$subject)) { - cat("##", gsub("_", " ", i), "\n\n") + source <- pp$source[pp$subject==i] |> unique() |> paste(collapse=", ") + cat("##", sprintf("%s (%s)", gsub("_", " ", i), source), "\n\n") pp |> dplyr::filter(subject==i) |> tt_preprints() |> diff --git a/man/add_prompt.Rd b/man/add_prompt.Rd index cfec929..d70ef81 100644 --- a/man/add_prompt.Rd +++ b/man/add_prompt.Rd @@ -12,7 +12,7 @@ add_prompt(preprints, ...) \item{...}{Additional arguments to \code{\link[=build_prompt_preprint]{build_prompt_preprint()}}.} } \value{ -A data frame of bioRxiv preprints with a prompt added. +A data frame of preprints with a prompt added. 
} \description{ Add prompt to a data frame of preprints diff --git a/man/add_summary.Rd b/man/add_summary.Rd index d25a9a2..ea4ceb3 100644 --- a/man/add_summary.Rd +++ b/man/add_summary.Rd @@ -4,7 +4,7 @@ \alias{add_summary} \title{Generate a summary from a data frame of prompts} \usage{ -add_summary(preprints, model = "llama3.1") +add_summary(preprints, model = "llama3.2") } \arguments{ \item{preprints}{Output from \code{\link[=get_preprints]{get_preprints()}} followed by \code{\link[=add_prompt]{add_prompt()}}.} diff --git a/man/biorecap-package.Rd b/man/biorecap-package.Rd index 19e8ab3..4b2b89f 100644 --- a/man/biorecap-package.Rd +++ b/man/biorecap-package.Rd @@ -4,11 +4,11 @@ \name{biorecap-package} \alias{biorecap} \alias{biorecap-package} -\title{biorecap: Retrieve and summarize bioRxiv preprints with a local LLM using ollama} +\title{biorecap: Retrieve and summarize bioRxiv and medRxiv preprints with a local LLM using ollama} \description{ \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} -Retrieve and summarize bioRxiv preprints with a local LLM using ollama. +Retrieve and summarize bioRxiv and medRxiv preprints with a local LLM using ollama. } \seealso{ Useful links: diff --git a/man/biorecap_report.Rd b/man/biorecap_report.Rd index 4934d67..8096276 100644 --- a/man/biorecap_report.Rd +++ b/man/biorecap_report.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/biorecap.R \name{biorecap_report} \alias{biorecap_report} -\title{Create a report from bioRxiv preprints} +\title{Create a report from bioRxiv/medRxiv preprints} \usage{ biorecap_report( output_dir = ".", @@ -22,7 +22,7 @@ biorecap_report( \item{model}{The model to use for generating summaries. See \code{\link[ollamar:list_models]{ollamar::list_models()}}.} -\item{use_example_preprints}{Use the example preprints data included with the package instead of fetching new data from bioRxiv. 
For diagnostic/testing purposes only.} +\item{use_example_preprints}{Use the example preprints data included with the package instead of fetching new data from bioRxiv/medRxiv. For diagnostic/testing purposes only.} \item{...}{Other arguments passed to \code{\link[rmarkdown:render]{rmarkdown::render()}}.} } @@ -30,7 +30,7 @@ biorecap_report( Nothing; called for its side effects to produce a report. } \description{ -Create a report from bioRxiv preprints +Create a report from bioRxiv/medRxiv preprints } \examples{ \dontrun{ diff --git a/man/build_prompt_subject.Rd b/man/build_prompt_subject.Rd index 3732804..378b0e3 100644 --- a/man/build_prompt_subject.Rd +++ b/man/build_prompt_subject.Rd @@ -9,8 +9,7 @@ build_prompt_subject( title, summary, nsentences = 5L, - instructions = - c("I am giving you information about preprints published in bioRxiv recently.", + instructions = c("I am giving you information about recent bioRxiv/medRxiv preprints.", "I'll give you the subject, preprint titles, and short summary of each paper.", "Please provide a general summary new advances in this subject/field in general.", "Provide this summary of the field in as many sentences as I instruct.", diff --git a/man/figures/report_screenshot.jpg b/man/figures/report_screenshot.jpg new file mode 100644 index 0000000..6c901e4 Binary files /dev/null and b/man/figures/report_screenshot.jpg differ diff --git a/man/figures/report_screenshot.png b/man/figures/report_screenshot.png deleted file mode 100644 index 4ba60f9..0000000 Binary files a/man/figures/report_screenshot.png and /dev/null differ diff --git a/man/get_preprints.Rd b/man/get_preprints.Rd index bed993d..dcc3750 100644 --- a/man/get_preprints.Rd +++ b/man/get_preprints.Rd @@ -2,29 +2,23 @@ % Please edit documentation in R/biorecap.R \name{get_preprints} \alias{get_preprints} -\title{Get bioRxiv preprints} +\title{Get bioRxiv/medRxiv preprints} \usage{ -get_preprints( - subject = "all", - baseurl = 
"https://connect.biorxiv.org/biorxiv_xml.php?subject=", - clean = TRUE -) +get_preprints(subject = "all", clean = TRUE) } \arguments{ -\item{subject}{A character vector of valid biorxiv subjects. See \link{subjects}.} - -\item{baseurl}{The base URL for the biorxiv RSS feed. Default is \verb{https://connect.biorxiv.org/biorxiv_xml.php?subject=}. Do not change unless you know what you are doing.} +\item{subject}{A character vector of valid bioRxiv and/or medRxiv subjects. See \link{subjects}.} \item{clean}{Logical; try to strip out graphical abstract information? If TRUE, this strips away any text between \code{O_FIG} and \code{C_FIG}, and the words \verb{graphical abstract} from the abstract text in the RSS feed.} } \value{ -A data frame of bioRxiv preprints. +A data frame of preprints from bioRxiv and/or medRxiv. } \description{ -Get bioRxiv preprints +Get bioRxiv/medRxiv preprints } \examples{ -preprints <- get_preprints(subject=c("bioinformatics", "genomics")) +preprints <- get_preprints(subject=c("bioinformatics", "Public_and_Global_Health")) preprints } diff --git a/man/subjects.Rd b/man/subjects.Rd index df13f96..13fe91f 100644 --- a/man/subjects.Rd +++ b/man/subjects.Rd @@ -5,7 +5,7 @@ \alias{subjects} \title{bioRxiv subjects} \format{ -A character vector +A list of character vectors of subjects, one for bioRxiv, one for medRxiv. 
} \usage{ subjects diff --git a/tests/testthat/test-data.R b/tests/testthat/test-data.R index 015cbd6..7fad2c2 100644 --- a/tests/testthat/test-data.R +++ b/tests/testthat/test-data.R @@ -1,8 +1,11 @@ test_that("subjects", { - expect_true(is.character(subjects)) - expect_identical(length(subjects), 27L) + expect_true(is.list(subjects)) + expect_true(is.character(subjects$biorxiv)) + expect_identical(length(subjects$biorxiv), 27L) + expect_true(is.character(subjects$medrxiv)) + expect_identical(length(subjects$medrxiv), 54L) }) test_that("example_preprints", { expect_true(is.data.frame(example_preprints)) - expect_identical(colnames(example_preprints), c("subject", "title", "url", "abstract", "prompt", "summary")) + expect_identical(colnames(example_preprints), c("source", "subject", "title", "url", "abstract", "prompt", "summary")) })