Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for PDFs in Claude and Gemini #265

Merged
merged 8 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ Collate:
'utils-coro.R'
'chat.R'
'content-image.R'
'content-pdf.R'
'content-tools.R'
'ellmer-package.R'
'httr2.R'
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ export(Content)
export(ContentImage)
export(ContentImageInline)
export(ContentImageRemote)
export(ContentPDF)
export(ContentText)
export(ContentToolRequest)
export(ContentToolResult)
Expand Down Expand Up @@ -33,6 +34,8 @@ export(chat_vllm)
export(content_image_file)
export(content_image_plot)
export(content_image_url)
export(content_pdf_file)
export(content_pdf_url)
export(contents_html)
export(contents_markdown)
export(contents_text)
Expand Down
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# ellmer (development version)

* New `content_pdf_file()` and `content_pdf_url()` allow you to upload PDFs to supported models. Models that currently support PDFs are Google Gemini and Anthropic Claude. With help from @walkerke and @andrie (#265).

* `Chat$get_model()` returns the model name (#299).

* `chat_gemini()` now defaults to using the gemini-2.0-flash model.
Expand Down
28 changes: 17 additions & 11 deletions R/content-image.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@

#' Encode image content for chat input
#' Encode images for chat input
#'
#' These functions are used to prepare image URLs and files for input to the
#' chatbot. The `content_image_url()` function is used to provide a URL to an
Expand Down Expand Up @@ -36,15 +35,8 @@ content_image_url <- function(url, detail = c("auto", "low", "high")) {
detail <- arg_match(detail)

if (grepl("^data:", url)) {
# https://developer.mozilla.org/en-US/docs/Web/URI/Schemes/data
parts <- strsplit(sub("^data:", "", url), ";")[[1]]
if (length(parts) != 2 || !grepl("^base64,", parts[[2]])) {
cli::cli_abort("{.arg url} is not a valid data url.")
}
content_type <- parts[[1]]
base64 <- sub("^base64,", "", parts[[2]])

ContentImageInline(content_type, base64)
parsed <- parse_data_url(url)
ContentImageInline(parsed$content_type, parsed$base64)
} else {
ContentImageRemote(url = url, detail = detail)
}
Expand Down Expand Up @@ -155,3 +147,17 @@ content_image_plot <- function(width = 768, height = 768) {

content_image_file(path, "image/png", resize = "none")
}


# Split a base64 data URL into its media type and payload.
#
# Only the two-part form `data:<content-type>;base64,<payload>` is accepted;
# anything else aborts with an error reported against `error_call`.
# See https://developer.mozilla.org/en-US/docs/Web/URI/Schemes/data
#
# Returns a list with elements `content_type` and `base64`.
parse_data_url <- function(url, error_call = caller_env()) {
  without_scheme <- sub("^data:", "", url)
  pieces <- strsplit(without_scheme, ";")[[1]]

  # Exactly two `;`-separated pieces, the second of which carries the payload.
  is_valid <- length(pieces) == 2 && grepl("^base64,", pieces[[2]])
  if (!is_valid) {
    cli::cli_abort("{.arg url} is not a valid data url.", call = error_call)
  }

  list(
    content_type = pieces[[1]],
    base64 = sub("^base64,", "", pieces[[2]])
  )
}
42 changes: 42 additions & 0 deletions R/content-pdf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#' Encode PDFs for chat input
#'
#' @description
#' These functions prepare PDFs as input to the chatbot. Use
#' `content_pdf_url()` to supply the URL of a PDF file and
#' `content_pdf_file()` to supply a PDF stored in a local file.
#'
#' Not all providers support PDF input, so check the documentation for the
#' provider you are using.
#'
#' @param path,url Path or URL to a PDF file.
#' @return A `ContentPDF` object.
#' @export
content_pdf_file <- function(path) {
  check_string(path, allow_empty = FALSE)

  # Directories also satisfy file.exists(), so rule them out explicitly.
  is_existing_file <- file.exists(path) && !dir.exists(path)
  if (!is_existing_file) {
    cli::cli_abort("{.arg path} must be an existing file.")
  }

  # The file's bytes are carried as a base64 string.
  encoded <- base64enc::base64encode(path)
  ContentPDF(type = "application/pdf", data = encoded)
}

#' @rdname content_pdf_file
#' @export
content_pdf_url <- function(url) {
  # Validate up front (mirroring content_pdf_file()) so a bad `url` produces a
  # clear argument error instead of failing inside grepl()/if.
  check_string(url, allow_empty = FALSE)

  if (grepl("^data:", url)) {
    # Inline data URL: reuse the already-encoded payload as is.
    # NOTE(review): the media type taken from the data URL is passed through
    # unchecked; confirm whether non-PDF types should be rejected here.
    parsed <- parse_data_url(url)
    ContentPDF(parsed$content_type, parsed$base64)
  } else {
    # TODO: need separate ContentPDFRemote type so we can use file upload
    # apis where they exist. Might need some kind of mutable state so can
    # record point to uploaded file.
    path <- tempfile(fileext = ".pdf")
    # add = TRUE so this cleanup never displaces other exit handlers.
    on.exit(unlink(path), add = TRUE)

    # Download the response body straight to the temporary file, then encode
    # it exactly like a local PDF. The response object itself is not needed.
    httr2::req_perform(httr2::request(url), path = path)
    content_pdf_file(path)
  }
}
15 changes: 15 additions & 0 deletions R/content.R
Original file line number Diff line number Diff line change
Expand Up @@ -270,3 +270,18 @@ as_content <- function(x, error_call = caller_env()) {
)
}
}

#' @rdname Content
#' @export
ContentPDF <- new_class(
"ContentPDF",
parent = Content,
properties = list(
# Media type of the document; content_pdf_file() sets it to "application/pdf"
# and the Claude/Gemini converters forward it as the media/MIME type.
type = prop_string(),
# Base64-encoded document content, as produced by content_pdf_file() or
# extracted from a data URL by content_pdf_url().
data = prop_string()
)
)

# format() for a ContentPDF yields a fixed placeholder rather than the
# encoded document data.
method(format, ContentPDF) <- function(x, ...) "<PDF document>"
24 changes: 23 additions & 1 deletion R/provider-bedrock.R
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ NULL
#' [Claude](https://aws.amazon.com/bedrock/claude/).
#'
#' ## Authentication
#'
#'
#' Authentication is handled through \{paws.common\}, so if authentication
#' does not work for you automatically, you'll need to follow the advice
#' at <https://www.paws-r-sdk.com/#credentials>. In particular, if your
Expand Down Expand Up @@ -267,6 +267,20 @@ method(as_json, list(ProviderBedrock, ContentImageInline)) <- function(provider,
)
}

# Convert a ContentPDF into a Bedrock document block.
# https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_DocumentBlock.html
method(as_json, list(ProviderBedrock, ContentPDF)) <- function(provider, x) {
  # About the `name` field, the AWS reference says:
  #> This field is vulnerable to prompt injections, because the model
  #> might inadvertently interpret it as instructions. Therefore, we
  #> recommend that you specify a neutral name.
  # Hence a generated, content-free name is used instead of anything derived
  # from the document itself.
  document <- list(
    name = bedrock_document_name(),
    format = "pdf",
    source = list(bytes = x@data)
  )

  list(document = document)
}

# https://docs.aws.amazon.com/bedrock/latest/APIReference/API_runtime_ToolUseBlock.html
method(as_json, list(ProviderBedrock, ContentToolRequest)) <- function(provider, x) {
list(
Expand Down Expand Up @@ -328,3 +342,11 @@ locate_aws_credentials <- function(profile) {
aws_creds_cache <- function(profile) {
credentials_cache(key = hash(c("aws", profile)))
}

# Generate a neutral document name ("document-1", "document-2", ...).
#
# The counter lives in the enclosing environment created by local(), so each
# call returns a new name. It starts at zero and is incremented before use,
# making the first issued name "document-1" (previously the sequence began at
# "document-2" because the counter was initialised to 1).
bedrock_document_name <- local({
  n_issued <- 0L
  function() {
    # `<<-` updates the counter captured by this closure.
    n_issued <<- n_issued + 1L
    paste0("document-", n_issued)
  }
})
11 changes: 11 additions & 0 deletions R/provider-claude.R
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,17 @@ method(as_json, list(ProviderClaude, ContentText)) <- function(provider, x) {
list(type = "text", text = x@text)
}

# Convert a ContentPDF into a Claude "document" content block whose source is
# the base64 payload tagged with the object's media type.
method(as_json, list(ProviderClaude, ContentPDF)) <- function(provider, x) {
  pdf_source <- list(
    type = "base64",
    media_type = x@type,
    data = x@data
  )

  list(type = "document", source = pdf_source)
}

method(as_json, list(ProviderClaude, ContentImageRemote)) <- function(provider, x) {
cli::cli_abort("Claude doesn't support remote images")
}
Expand Down
9 changes: 9 additions & 0 deletions R/provider-gemini.R
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,15 @@ method(as_json, list(ProviderGemini, ContentText)) <- function(provider, x) {
}
}

# Convert a ContentPDF into a Gemini `inlineData` part carrying the MIME type
# and the base64 payload.
method(as_json, list(ProviderGemini, ContentPDF)) <- function(provider, x) {
  inline_data <- list(mimeType = x@type, data = x@data)
  list(inlineData = inline_data)
}

# https://ai.google.dev/api/caching#FileData
method(as_json, list(ProviderGemini, ContentImageRemote)) <- function(provider, x) {
cli::cli_abort("Gemini doesn't support remote images")
Expand Down
59 changes: 30 additions & 29 deletions _pkgdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,36 +10,37 @@ development:
mode: auto

reference:
- title: Chatbots
contents:
- starts_with("chat_")
- token_usage
- title: Chatbots
contents:
- starts_with("chat_")
- token_usage

- title: Chat helpers
contents:
- create_tool_def
- content_image_url
- starts_with("live_")
- interpolate
- title: Chat helpers
contents:
- create_tool_def
- content_image_url
- content_pdf_url
- starts_with("live_")
- interpolate

- title: Tools and structured data
contents:
- tool
- type_boolean
- title: Tools and structured data
contents:
- tool
- type_boolean

- title: Objects
desc: >
These classes abstract across behaviour differences in chat providers so
that for typical ellmer use you don't need to worry about them. You'll need
to learn more about the objects if you're doing something that's only
supported by one provider, or if you're implementing a new provider.
contents:
- Turn
- Provider
- Content
- Chat
- Type
- title: Objects
desc: >
These classes abstract across behaviour differences in chat providers so
that for typical ellmer use you don't need to worry about them. You'll need
to learn more about the objects if you're doing something that's only
supported by one provider, or if you're implementing a new provider.
contents:
- Turn
- Provider
- Content
- Chat
- Type

- title: Utilities
contents:
- contents_text
- title: Utilities
contents:
- contents_text
5 changes: 5 additions & 0 deletions ellmer.Rproj
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@ SaveWorkspace: No
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
LineEndingConversion: Posix
Expand Down
3 changes: 3 additions & 0 deletions man/Content.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/content_image_url.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

25 changes: 25 additions & 0 deletions man/content_pdf_file.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file added tests/testthat/apples.pdf
Binary file not shown.
12 changes: 12 additions & 0 deletions tests/testthat/helper-provider.R
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,15 @@ test_images_remote_error <- function(chat_fun) {
)
expect_length(chat$get_turns(), 0)
}

# PDF ---------------------------------------------------------------------

# Shared provider check: send the bundled apples.pdf to a fresh chat, confirm
# the model reports its title, then ask a follow-up question in the same chat
# about the document's content.
test_pdf_local <- function(chat_fun) {
  chat <- chat_fun()
  pdf <- content_pdf_file(test_path("apples.pdf"))

  title_answer <- chat$chat("What's the title of this document?", pdf)
  expect_match(title_answer, "Apples are tasty")

  followup_answer <- chat$chat("What apple is not tasty?")
  expect_match(followup_answer, "red delicious")
}
4 changes: 4 additions & 0 deletions tests/testthat/test-content-pdf.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
test_that("can create pdf from path", {
  # Encoding the bundled fixture should yield a ContentPDF object.
  pdf <- content_pdf_file(test_path("apples.pdf"))
  expect_s3_class(pdf, "ellmer::ContentPDF")
})
7 changes: 7 additions & 0 deletions tests/testthat/test-provider-bedrock.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ test_that("can use images", {
test_images_remote_error(chat_fun)
})

test_that("can use pdfs", {
# Run the shared local-PDF helper (test_pdf_local) against the Bedrock chat
# constructor.
chat_fun <- chat_bedrock

test_pdf_local(chat_fun)
})


# Auth --------------------------------------------------------------------

test_that("AWS credential caching works as expected", {
Expand Down
6 changes: 6 additions & 0 deletions tests/testthat/test-provider-claude.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,3 +46,9 @@ test_that("can use images", {
test_images_inline(chat_fun)
test_images_remote_error(chat_fun)
})

test_that("can use pdfs", {
# Run the shared local-PDF helper (test_pdf_local) against the Claude chat
# constructor.
chat_fun <- chat_claude

test_pdf_local(chat_fun)
})
Loading
Loading