Skip to content

Commit

Permalink
Add pdf input for Sonnet (#142)
Browse files Browse the repository at this point in the history
* llm-claude.el (llm-claude--multipart-content): Handle PDFs in
multipart content.
(llm-capabilities): Add pdf-input.
* llm-integration-test.el (llm-pdf-chat): New integration test.
* llm-models.el (llm-model): Add pdf-input to capabilities docstring.
(llm-models): Add pdf-input to capabilities of Claude 3.5 Sonnet.
* llm.el (llm-media):
(llm-capabilities):  Add pdf-input to documentation.
  • Loading branch information
ultronozm authored Jan 22, 2025
1 parent 600ea5e commit 358bbaa
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 18 deletions.
30 changes: 17 additions & 13 deletions llm-claude.el
Original file line number Diff line number Diff line change
Expand Up @@ -92,18 +92,22 @@

(defun llm-claude--multipart-content (content)
"Return CONTENT as a list of Claude multipart content."
(vconcat (mapcar (lambda (part)
(cond ((stringp part)
`(:type "text"
:text ,part))
((llm-media-p part)
`(:type "image"
:source (:type "base64"
:media_type ,(llm-media-mime-type part)
:data ,(base64-encode-string (llm-media-data part) t))))
(t
(error "Unsupported multipart content: %s" part))))
(llm-multipart-parts content))))
(vconcat
(mapcar (lambda (part)
(cond ((stringp part)
`(:type "text"
:text ,part))
((llm-media-p part)
(let ((source (list :type "base64"
:media_type (llm-media-mime-type part)
:data (base64-encode-string (llm-media-data part) t))))
`(:type ,(if (equal (llm-media-mime-type part) "application/pdf")
"document"
"image")
:source ,source)))
(t
(error "Unsupported multipart content: %s" part))))
(llm-multipart-parts content))))

(cl-defmethod llm-provider-extract-tool-uses ((_ llm-claude) response)
(let ((content (append (assoc-default 'content response) nil)))
Expand Down Expand Up @@ -178,7 +182,7 @@
"Claude")

(cl-defmethod llm-capabilities ((_ llm-claude))
(list 'streaming 'function-calls 'image-input))
(list 'streaming 'function-calls 'image-input 'pdf-input))

(cl-defmethod llm-provider-append-to-prompt ((_ llm-claude) prompt result
&optional tool-use-results)
Expand Down
17 changes: 17 additions & 0 deletions llm-integration-test.el
Original file line number Diff line number Diff line change
Expand Up @@ -290,6 +290,23 @@ else. We really just want to see if it's in the right ballpark."
(should (stringp result))
(should (llm-integration-test-string-eq "owl" (string-trim (downcase result)))))))

(llm-def-integration-test
  llm-pdf-chat (provider)
  ;; Run only against providers that list `pdf-input' among their
  ;; capabilities; for every other provider the body is skipped.
  (when (member 'pdf-input (llm-capabilities provider))
    (let* ((pdf-path
            (expand-file-name "test.pdf" llm-integration-current-directory))
           ;; Read the file into a unibyte temp buffer with no decoding, so
           ;; the resulting string holds the raw bytes of the PDF.
           (pdf-bytes
            (with-temp-buffer
              (set-buffer-multibyte nil)
              (insert-file-contents-literally pdf-path)
              (buffer-string)))
           (question
            "What symbol occurs in the PDF file? If you do not see a PDF file, please let me know. If you do, please answer in one letter, without punctuation or whitespace.")
           ;; One multipart message: the question text followed by the PDF.
           (prompt
            (llm-make-chat-prompt
             (llm-make-multipart
              question
              (make-llm-media :mime-type "application/pdf" :data pdf-bytes))))
           (answer (llm-chat provider prompt)))
      (should (stringp answer))
      ;; The expected reply is the single letter "x"; the answer is
      ;; lower-cased and trimmed before comparison.
      ;; NOTE(review): presumably test.pdf contains just that symbol -- confirm.
      (should (llm-integration-test-string-eq
               "x"
               (string-trim (downcase answer)))))))

(llm-def-integration-test llm-json-test (provider)
(when (member 'json-response (llm-capabilities provider))
(let ((result (llm-chat
Expand Down
6 changes: 3 additions & 3 deletions llm-models.el
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ NAME is the name of the model, appropriate for showing a user.
CAPABILITIES is a list of symbols representing the capabilities of the
model, one of `embedding', `generation', `tool-use',
`image-input', `image-output', `audio-input', `video-input', `caching'
and `free-software'.
`image-input', `image-output', `audio-input', `video-input', `pdf-input',
`caching' and `free-software'.
REGEX is a regular expression that can be used to identify the model, uniquely (it shouldn't conflict with any other model)"
name
Expand Down Expand Up @@ -103,7 +103,7 @@ REGEX is a regular expression that can be used to identify the model, uniquely (
;; https://docs.anthropic.com/en/docs/about-claude/models
(make-llm-model
:name "Claude 3.5 Sonnet" :symbol 'claude-3.5-sonnet
:capabilities '(generation tool-use image-input caching)
:capabilities '(generation tool-use image-input pdf-input caching)
:context-length 200000
:regex "claude-3.5-sonnet")
(make-llm-model
Expand Down
6 changes: 4 additions & 2 deletions llm.el
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ MIME types are accepted by all providers.
DATA is a (binary) string containing the data. The string should use
unibyte encoding.
This should only be used if the `image-input' or `audio-input'
capability is available, as indicated by `llm-capabilities'."
This should only be used if the `image-input', `audio-input', or
`pdf-input' capability is available, as indicated by `llm-capabilities'.
mime-type data)

(defun llm--image-to-media (image)
Expand Down Expand Up @@ -556,6 +556,8 @@ won't have any partial responses, so basically just operates like
`image-input': the LLM can accept images as input.
`pdf-input': the LLM can accept PDF documents as input.
`json-response': the LLM can be requested to return responses only in
JSON format.
Expand Down
Binary file added test.pdf
Binary file not shown.

0 comments on commit 358bbaa

Please sign in to comment.