Merge pull request #109 from alan-turing-institute/anthropic-multimodal

Add anthropic multimodal option
alan-turing-institute · Oct 29, 2024 · 490d33c · 490d33c
2 parents c6ee30c + b7209cc
commit 490d33c
Show file tree

Hide file tree

Showing 12 changed files with 622 additions and 13 deletions.
diff --git a/examples/anthropic/anthropic-multimodal.ipynb b/examples/anthropic/anthropic-multimodal.ipynb
diff --git a/examples/anthropic/data/input/anthropic-multimodal-example.jsonl b/examples/anthropic/data/input/anthropic-multimodal-example.jsonl
@@ -0,0 +1,3 @@
+{"id": 0, "api": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "prompt": [{"role": "user", "content": ["describe what is happening in this image", {"type": "image", "source": {"media": "pantani_giro.jpg", "media_type": "image/jpeg"}}]}], "parameters": {"temperature": 1, "max_tokens": 100}}
+{"id": 1, "api": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "prompt": [{"role": "user", "content": [{"type": "image", "source": {"media": "mortadella.jpg", "media_type": "image/jpeg"}}, "what is this?"]}], "parameters": {"temperature": 1, "max_tokens": 100}}
+{"id": 2, "api": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "prompt": [{"role": "user", "content": ["what is in this image?", {"type": "image", "source": {"media": "pantani_giro.jpg", "media_type": "image/jpeg"}}]}, {"role": "assistant", "content": "This is image shows a group of cyclists."}, {"role": "user", "content": "are there any notable cyclists in this image? what are their names?"}], "parameters": {"temperature": 1, "max_tokens": 100}}
diff --git a/examples/anthropic/data/media/mortadella.jpg b/examples/anthropic/data/media/mortadella.jpg
diff --git a/examples/anthropic/data/media/pantani_giro.jpg b/examples/anthropic/data/media/pantani_giro.jpg
diff --git a/...ropic-multimodal-example/29-10-2024-15-36-27-completed-anthropic-multimodal-example.jsonl b/...ropic-multimodal-example/29-10-2024-15-36-27-completed-anthropic-multimodal-example.jsonl
@@ -0,0 +1,3 @@
+{"id": 0, "api": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "prompt": [{"role": "user", "content": ["describe what is happening in this image", {"type": "image", "source": {"media": "pantani_giro.jpg", "media_type": "image/jpeg"}}]}], "parameters": {"temperature": 1, "max_tokens": 100}, "timestamp_sent": "29-10-2024-15-36-29", "response": "This image shows professional cyclists competing in what appears to be a cycling race, likely from the 1990s based on the equipment and team jerseys visible. There are several riders in the frame, including one wearing the distinctive pink jersey (known as the maglia rosa in the Giro d'Italia). The cyclists are riding along a stone wall with an iron fence on top, and they're using classic road racing bikes with distinctive team color schemes - notably some turquoise Bian"}
+{"id": 1, "api": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "prompt": [{"role": "user", "content": [{"type": "image", "source": {"media": "mortadella.jpg", "media_type": "image/jpeg"}}, "what is this?"]}], "parameters": {"temperature": 1, "max_tokens": 100}, "timestamp_sent": "29-10-2024-15-36-31", "response": "These appear to be mortadella and other Italian cold cuts or processed meats. The larger ones with the string/twine pattern wrapped around them are likely mortadella (a type of Italian bologna), while the pink spotted ones appear to be a different variety of cold cut or processed meat product. The spotted pattern is characteristic of certain Italian deli meats where small pieces of fat or other ingredients are distributed throughout the meat. These are commonly sliced and served in sandwiches or on"}
+{"id": 2, "api": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "prompt": [{"role": "user", "content": ["what is in this image?", {"type": "image", "source": {"media": "pantani_giro.jpg", "media_type": "image/jpeg"}}]}, {"role": "assistant", "content": "This is image shows a group of cyclists."}, {"role": "user", "content": "are there any notable cyclists in this image? what are their names?"}], "parameters": {"temperature": 1, "max_tokens": 100}, "timestamp_sent": "29-10-2024-15-36-33", "response": "From the image, I can see this appears to be from a professional cycling race, likely from the 1990s based on the equipment and jerseys. While there are professional cyclists in the image, including one wearing the pink jersey (which is the leader's jersey in the Giro d'Italia), I should refrain from identifying specific individuals by name. The image shows a group of riders from various teams including Mercatone Uno and what appears to be racing in a major"}
diff --git a/...anthropic-multimodal-example/29-10-2024-15-36-27-input-anthropic-multimodal-example.jsonl b/...anthropic-multimodal-example/29-10-2024-15-36-27-input-anthropic-multimodal-example.jsonl
@@ -0,0 +1,3 @@
+{"id": 0, "api": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "prompt": [{"role": "user", "content": ["describe what is happening in this image", {"type": "image", "source": {"media": "pantani_giro.jpg", "media_type": "image/jpeg"}}]}], "parameters": {"temperature": 1, "max_tokens": 100}}
+{"id": 1, "api": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "prompt": [{"role": "user", "content": [{"type": "image", "source": {"media": "mortadella.jpg", "media_type": "image/jpeg"}}, "what is this?"]}], "parameters": {"temperature": 1, "max_tokens": 100}}
+{"id": 2, "api": "anthropic", "model_name": "claude-3-5-sonnet-20241022", "prompt": [{"role": "user", "content": ["what is in this image?", {"type": "image", "source": {"media": "pantani_giro.jpg", "media_type": "image/jpeg"}}]}, {"role": "assistant", "content": "This is image shows a group of cyclists."}, {"role": "user", "content": "are there any notable cyclists in this image? what are their names?"}], "parameters": {"temperature": 1, "max_tokens": 100}}
diff --git a/...put/anthropic-multimodal-example/29-10-2024-15-36-27-log-anthropic-multimodal-example.txt b/...put/anthropic-multimodal-example/29-10-2024-15-36-27-log-anthropic-multimodal-example.txt
@@ -0,0 +1 @@
+29-10-2024, 15:36: Completed experiment: anthropic-multimodal-example.jsonl! Experiment processing time: 9.878 seconds, Average time per query: 3.293 seconds
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -29,6 +29,7 @@ nav:
       - Anthropic:
           - Example: examples/anthropic/README.md
           - Notebook: examples/anthropic/anthropic.ipynb
+          - Multimodal: examples/anthropic/anthropic-multimodal.ipynb
       - Gemini:
           - Example: examples/gemini/README.md
           - Notebook: examples/gemini/gemini.ipynb

diff --git a/src/prompto/apis/anthropic/anthropic.py b/src/prompto/apis/anthropic/anthropic.py
@@ -5,6 +5,7 @@
 
 from prompto.apis.anthropic.anthropic_utils import (
     anthropic_chat_roles,
+    convert_dict_to_input,
     process_response,
 )
 from prompto.apis.base import AsyncAPI
@@ -331,7 +332,7 @@ async def _query_history(self, prompt_dict: dict, index: int | str) -> dict:
 
         # if system message is present, then it must be the only one
         if len(system) == 0:
-            system = None
+            system = ""
         elif len(system) == 1:
             system = system[0]
         else:
@@ -342,7 +343,12 @@ async def _query_history(self, prompt_dict: dict, index: int | str) -> dict:
         try:
             response = await client.messages.create(
                 model=model_name,
-                messages=messages,
+                messages=[
+                    convert_dict_to_input(
+                        content_dict=x, media_folder=self.settings.media_folder
+                    )
+                    for x in messages
+                ],
                 system=system,
                 **generation_config,
             )

diff --git a/src/prompto/apis/anthropic/anthropic_utils.py b/src/prompto/apis/anthropic/anthropic_utils.py
@@ -1,8 +1,163 @@
+import base64
+import os
+
 from anthropic.types.message import Message
 
 anthropic_chat_roles = set(["user", "assistant"])
 
 
+def encode_image(image_path):
+    """
+    Read a file from disk and return its contents as a base64 string.
+
+    Parameters
+    ----------
+    image_path : str
+        Path of the file to read. The file is opened in binary mode.
+
+    Returns
+    -------
+    str
+        Base64 encoding of the file's bytes, decoded to a UTF-8 string.
+    """
+    with open(image_path, "rb") as handle:
+        raw_bytes = handle.read()
+
+    encoded = base64.b64encode(raw_bytes)
+    return encoded.decode("utf-8")
+
+
+def parse_content_value(content: dict | str, media_folder: str) -> dict:
+    """
+    Parse a single content value and create a dictionary input for Anthropic API.
+    If content is a string, a dictionary to represent a text object is returned.
+    If content is a dictionary, expected keys are:
+    - type: str, multimedia type, one of ["text", "image"]
+
+    If type is "text", expect a key "text" with the text content.
+    If type is "image", expect a key "source" which is a dictionary with keys:
+    - media: str, path of the image file, joined onto media_folder
+    - media_type: str, media type of the image (e.g. "image/jpeg")
+
+    For images, the file is read from disk and embedded in the returned
+    dictionary as a base64-encoded string.
+
+    Parameters
+    ----------
+    content : dict | str
+        Either a dictionary or a string which defines a multimodal object.
+    media_folder : str
+        Folder where media files are stored ({data_folder}/media).
+
+    Returns
+    -------
+    dict
+        Dictionary which defines a text or image object
+
+    Raises
+    ------
+    ValueError
+        If "type" is missing or is neither "text" nor "image", if "text" is
+        missing for a text object, or if "source" (or its "media_type" /
+        "media" keys) is missing or "source" is not a dictionary for an
+        image object.
+    """
+    if isinstance(content, str):
+        return {"type": "text", "text": content}
+
+    # read multimedia type (local is not called "type" to avoid shadowing the builtin)
+    content_type = content.get("type")
+    if content_type is None:
+        raise ValueError("Multimedia type is not specified")
+
+    # create dictionary based on multimedia type
+    if content_type == "text":
+        # read text content
+        text = content.get("text")
+        if text is None:
+            raise ValueError(
+                "Got type == 'text', but 'text' is not a key in the content dictionary"
+            )
+
+        return {"type": "text", "text": text}
+
+    if content_type != "image":
+        raise ValueError(f"Unsupported multimedia type: {content_type}")
+
+    # read image source description
+    source = content.get("source")
+    if source is None:
+        raise ValueError(
+            "Got type == 'image', but 'source' is not a key in the content dictionary"
+        )
+
+    if not isinstance(source, dict):
+        raise ValueError("Got type == 'image', but 'source' is not a dictionary")
+
+    # get media type
+    media_type = source.get("media_type")
+    if media_type is None:
+        raise ValueError(
+            "Got type == 'image', but 'media_type' is not a key in the content['source'] dictionary"
+        )
+
+    # get image file location
+    media = source.get("media")
+    if media is None:
+        raise ValueError(
+            "Got type == 'image', but 'media' is not a key in the content['source'] dictionary"
+        )
+
+    # media is a local path inside media_folder and needs to be encoded to base64
+    image_path = os.path.join(media_folder, media)
+    base64_image = encode_image(image_path)
+    return {
+        "type": "image",
+        "source": {
+            "type": "base64",
+            "media_type": media_type,
+            "data": base64_image,
+        },
+    }
+
+
+def parse_content(
+    contents: list[dict | str] | dict | str, media_folder: str
+) -> list[dict]:
+    """
+    Turn contents data into a list of text/image dictionaries.
+    A single dictionary or string is handled as a list holding just that item.
+
+    Parameters
+    ----------
+    contents : list[dict | str] | dict | str
+        Contents data to parse.
+        Can be a list of dictionaries and strings, or a single dictionary or string.
+    media_folder : str
+        Folder where media files are stored ({data_folder}/media).
+
+    Returns
+    -------
+    list[dict]
+        One dictionary per item, each defining a text or image object
+        (as produced by parse_content_value)
+    """
+    # a lone dictionary or string becomes a one-element list
+    items = [contents] if isinstance(contents, (dict, str)) else contents
+
+    parsed = []
+    for item in items:
+        parsed.append(parse_content_value(item, media_folder=media_folder))
+    return parsed
+
+
+def convert_dict_to_input(content_dict: dict, media_folder: str) -> dict:
+    """
+    Convert dictionary to an input that can be used by the Anthropic API.
+    The output is a dictionary with keys "role" and "content".
+
+    Parameters
+    ----------
+    content_dict : dict
+        Content dictionary with keys "role" and "content". The value of
+        "content" can be a string, a dictionary, or a list of strings
+        and dictionaries (see parse_content).
+    media_folder : str
+        Folder where media files are stored ({data_folder}/media).
+
+    Returns
+    -------
+    dict
+        dict with keys "role" and "content" where the value of
+        role is copied unchanged from content_dict and the value of
+        content is a list of dictionaries, each defining a text or
+        image object.
+
+    Raises
+    ------
+    KeyError
+        If "role" or "content" is not a key in content_dict.
+    """
+    # "role" is checked before "content" so the first missing key is reported
+    for required_key in ("role", "content"):
+        if required_key not in content_dict:
+            raise KeyError(f"{required_key} key is missing in content dictionary")
+
+    return {
+        "role": content_dict["role"],
+        "content": parse_content(
+            content_dict["content"],
+            media_folder=media_folder,
+        ),
+    }
+
+
 def process_response(response: Message) -> str | list[str]:
     """
     Helper function to process the response from the Anthropic API.

diff --git a/src/prompto/apis/openai/openai_utils.py b/src/prompto/apis/openai/openai_utils.py
@@ -44,7 +44,7 @@ def parse_content_value(content: dict | str, media_folder: str) -> dict:
     if type is None:
         raise ValueError("Multimedia type is not specified")
 
-    # create Part object based on multimedia type
+    # create dictionary based on multimedia type
     if type == "text":
         # read file location
         text = content.get("text")

diff --git a/tests/apis/anthropic/test_anthropic_history_input.py b/tests/apis/anthropic/test_anthropic_history_input.py
@@ -120,7 +120,15 @@ async def test_anthropic_query_history(
     mock_anthropic.assert_awaited_once_with(
         model=prompt_dict_history["model_name"],
         messages=[
-            {"role": "user", "content": prompt_dict_history["prompt"][1]["content"]}
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_dict_history["prompt"][1]["content"],
+                    }
+                ],
+            }
         ],
         system=prompt_dict_history["prompt"][0]["content"],
         **prompt_dict_history["parameters"],
@@ -162,7 +170,15 @@ async def test_anthropic_query_history_error(
     mock_anthropic.assert_awaited_once_with(
         model=prompt_dict_history["model_name"],
         messages=[
-            {"role": "user", "content": prompt_dict_history["prompt"][1]["content"]}
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_dict_history["prompt"][1]["content"],
+                    }
+                ],
+            }
         ],
         system=prompt_dict_history["prompt"][0]["content"],
         **prompt_dict_history["parameters"],
@@ -221,18 +237,33 @@ async def test_anthropic_query_history_no_system(
         messages=[
             {
                 "role": "user",
-                "content": prompt_dict_history_no_system["prompt"][0]["content"],
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_dict_history_no_system["prompt"][0]["content"],
+                    }
+                ],
             },
             {
                 "role": "assistant",
-                "content": prompt_dict_history_no_system["prompt"][1]["content"],
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_dict_history_no_system["prompt"][1]["content"],
+                    }
+                ],
             },
             {
                 "role": "user",
-                "content": prompt_dict_history_no_system["prompt"][2]["content"],
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_dict_history_no_system["prompt"][2]["content"],
+                    }
+                ],
             },
         ],
-        system=None,
+        system="",
         **prompt_dict_history_no_system["parameters"],
     )
 
@@ -278,18 +309,33 @@ async def test_anthropic_query_history_error_no_system(
         messages=[
             {
                 "role": "user",
-                "content": prompt_dict_history_no_system["prompt"][0]["content"],
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_dict_history_no_system["prompt"][0]["content"],
+                    }
+                ],
             },
             {
                 "role": "assistant",
-                "content": prompt_dict_history_no_system["prompt"][1]["content"],
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_dict_history_no_system["prompt"][1]["content"],
+                    }
+                ],
             },
             {
                 "role": "user",
-                "content": prompt_dict_history_no_system["prompt"][2]["content"],
+                "content": [
+                    {
+                        "type": "text",
+                        "text": prompt_dict_history_no_system["prompt"][2]["content"],
+                    }
+                ],
             },
         ],
-        system=None,
+        system="",
         **prompt_dict_history_no_system["parameters"],
     )