Add streaming responses to describe_image

AlfrescoLabs · dsibilio · Sep 12, 2024 · Sep 12, 2024 · Sep 12, 2024 · Sep 16, 2024
commit 3e32f25120a6b09866d2fd8c816e0b4fd45c33d9
diff --git a/alfresco_ai_assistant.py b/alfresco_ai_assistant.py
@@ -183,7 +183,9 @@ def describe_image(document_title: str, user_prompt: str) -> str:
     """Explain, describe, analyze an image. Requires the whole, unprocessed, user prompt/input."""
     document = get_document_content(document_title)
     image_b64 = base64.b64encode(document["content"]).decode("utf-8")
-    return vision_llm.invoke(user_prompt if user_prompt else "Describe the image.", images=[image_b64])
+    response = vision_llm.stream(user_prompt if user_prompt else "Describe the image.", images=[image_b64])
+    st.write_stream(response)
+    return None
 
 tools = [discovery, transform_content, translate_content, redact_content, list_recent_content_snippets, copy_file, create_pdf_report, describe_image]
 rendered_tools = render_text_description(tools)

diff --git a/commons.py b/commons.py
@@ -61,7 +61,16 @@ def load_llm(llm_name: str, logger=BaseLogger(), config={}):
         )
     elif llm_name == "llava":
         logger.info("LLM: Using Llava")
-        return Ollama(base_url=config["ollama_base_url"], model="llava")
+        return ChatOllama(
+            temperature=0,
+            base_url=config["ollama_base_url"],
+            model="llava",
+            streaming=True,
+            # seed=2,
+            top_k=10,  # A higher value (100) will give more diverse answers, while a lower value (10) will be more conservative.
+            top_p=0.3,  # A higher value (0.95) will lead to more diverse text, while a lower value (0.5) will generate more focused text.
+            num_ctx=3072,  # Sets the size of the context window used to generate the next token.
+        )
     elif len(llm_name):
         logger.info(f"LLM: Using Ollama: {llm_name}")
         return ChatOllama(