Don't require setting streaming in chat engine (run-llama#6924)
Disiok authored Jul 15, 2023
1 parent fb8ed29 commit 4132938
Showing 4 changed files with 82 additions and 36 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -9,6 +9,7 @@
 ### Bug Fixes / Nits
 - fix null metadata for searching existing vector dbs (#6912)
 - add module guide docs for `SimpleDirectoryReader` (#6916)
+- make sure `CondenseQuestionChatEngine` streaming chat endpoints work even when `streaming=True` is not explicitly set on the underlying query engine

 ## [v0.7.8] - 2023-07-13

docs (file name not shown)

@@ -97,7 +97,7 @@ This is somewhat inconsistent with query engine (where you pass in a `streaming=True`
 ```

 ```python
-chat_engine = index.as_chat_engine(streaming=True)
+chat_engine = index.as_chat_engine()
 streaming_response = chat_engine.stream_chat("Tell me a joke.")
 for token in streaming_response.response_gen:
     print(token, end="")
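Note: the truncated context line above ("This is somewhat inconsistent with query engine…") points at the asymmetry this commit narrows. Below is a minimal sketch of the two patterns after this change, assuming the v0.7.x high-level API; the data directory and index setup are illustrative, not part of the diff:

```python
from llama_index import SimpleDirectoryReader, VectorStoreIndex

# Illustrative setup; any index works.
documents = SimpleDirectoryReader("./data").load_data()
index = VectorStoreIndex.from_documents(documents)

# Query engine: streaming is still opted into at construction time.
query_engine = index.as_query_engine(streaming=True)
query_engine.query("Tell me a joke.").print_response_stream()

# Chat engine: after this commit, no constructor flag is needed;
# stream_chat toggles streaming on the underlying query engine itself.
chat_engine = index.as_chat_engine()
streaming_response = chat_engine.stream_chat("Tell me a joke.")
for token in streaming_response.response_gen:
    print(token, end="")
```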
53 changes: 21 additions & 32 deletions docs/examples/chat_engine/chat_engine_condense_question.ipynb
@@ -329,49 +329,38 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "chat_engine = index.as_chat_engine(verbose=True, streaming=True)"
+    "chat_engine = index.as_chat_engine(verbose=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 3,
    "id": "250abd43",
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Querying with: What did Paul Graham do after YC? Can you write a poem to describe his activities?\n",
-      "After YC, Paul Graham took a break,\n",
-      "To focus on something new, for his own sake.\n",
-      "He decided to paint, to see how good he could be,\n",
-      "And for a while, it consumed him completely.\n",
-      "\n",
-      "He spent 2014 with brush in hand,\n",
-      "Creating art, in a world so grand.\n",
-      "But then, in the middle of a painting, he lost his drive,\n",
-      "Finishing it felt like a chore, he couldn't revive.\n",
-      "\n",
-      "So he put down his brushes, and turned to writing,\n",
-      "Essays flowed from his mind, thoughts exciting.\n",
-      "But that wasn't enough, he needed more,\n",
-      "And so, he returned to Lisp, a language to explore.\n",
+      "After YC, Paul Graham continued to inspire,\n",
+      "With his wisdom and insights, he aimed higher.\n",
+      "He wrote essays and books, sharing his knowledge,\n",
+      "Guiding entrepreneurs through their startup college.\n",
       "\n",
-      "Lisp, a language of computation and creation,\n",
-      "Brought him back to his coding fascination.\n",
-      "He worked on YC's internal software, with Arc in his hand,\n",
-      "But gradually, his focus shifted, as YC's demands expanded.\n",
+      "He founded a startup incubator, Y Combinator,\n",
+      "Nurturing ideas and fostering innovation galore.\n",
+      "Investing in startups, helping them grow,\n",
+      "Supporting founders, giving them a glow.\n",
       "\n",
-      "He realized that YC had become his life's work,\n",
-      "And it was time for a change, a new perk.\n",
-      "So he handed over the reins to Sam Altman,\n",
-      "And stepped back, as a new chapter began.\n",
+      "Paul Graham's influence extended far and wide,\n",
+      "As he championed the startup ecosystem with pride.\n",
+      "His words resonated, his advice was gold,\n",
+      "Empowering entrepreneurs, making them bold.\n",
       "\n",
-      "Now, he writes and advises, with wisdom to share,\n",
-      "His experiences and insights, he's always there.\n",
-      "Paul Graham, a man of many talents and dreams,\n",
-      "Continues to inspire, through his words and schemes."
+      "So, after YC, Paul Graham's journey did not cease,\n",
+      "He continued to shape the startup world with ease.\n",
+      "A visionary, a mentor, a guiding light,\n",
+      "Paul Graham's impact shines ever so bright."
      ]
     }
    ],
@@ -394,9 +383,9 @@
   ],
   "metadata": {
    "kernelspec": {
-    "display_name": "venv",
+    "display_name": "llama",
     "language": "python",
-    "name": "venv"
+    "name": "python3"
    },
    "language_info": {
     "codemirror_mode": {
@@ -408,7 +397,7 @@
     "name": "python",
     "nbconvert_exporter": "python",
     "pygments_lexer": "ipython3",
-    "version": "3.9.6"
+    "version": "3.9.16"
    }
   },
   "nbformat": 4,
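The notebook uses the `as_chat_engine` shortcut, but the fix also covers a `CondenseQuestionChatEngine` assembled by hand over a query engine that never set `streaming=True`. A hedged sketch, assuming the `from_defaults` constructor from v0.7.x and the `index` built earlier in the notebook:

```python
from llama_index.chat_engine import CondenseQuestionChatEngine

# Deliberately built WITHOUT streaming=True; before this commit,
# stream_chat on the wrapping chat engine would not stream tokens.
query_engine = index.as_query_engine()

chat_engine = CondenseQuestionChatEngine.from_defaults(
    query_engine=query_engine,
    verbose=True,
)

streaming_response = chat_engine.stream_chat(
    "What did Paul Graham do after YC? Can you write a poem to describe his activities?"
)
for token in streaming_response.response_gen:
    print(token, end="")
```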
62 changes: 59 additions & 3 deletions llama_index/chat_engine/condense_question.py
@@ -1,9 +1,9 @@
 import logging
-from typing import Any, List, Type, Optional
+from typing import Any, List, Optional, Type

 from llama_index.chat_engine.types import (
-    BaseChatEngine,
     AgentChatResponse,
+    BaseChatEngine,
     StreamingAgentChatResponse,
 )
 from llama_index.chat_engine.utils import response_gen_with_chat_history
@@ -13,7 +13,7 @@
 from llama_index.llms.generic_utils import messages_to_history_str
 from llama_index.memory import BaseMemory, ChatMemoryBuffer
 from llama_index.prompts.base import Prompt
-from llama_index.response.schema import StreamingResponse, RESPONSE_TYPE
+from llama_index.response.schema import RESPONSE_TYPE, StreamingResponse
 from llama_index.tools import ToolOutput

 logger = logging.getLogger(__name__)
@@ -151,8 +151,22 @@ def chat(
         if self._verbose:
             print(log_str)

+        # TODO: right now, query engine uses a class attribute to configure streaming,
+        # we are moving towards separate streaming and non-streaming methods.
+        # In the meantime, use this hack to toggle streaming.
+        from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine
+
+        if isinstance(self._query_engine, RetrieverQueryEngine):
+            is_streaming = self._query_engine._response_synthesizer._streaming
+            self._query_engine._response_synthesizer._streaming = False
+
         # Query with standalone question
         query_response = self._query_engine.query(condensed_question)

+        # NOTE: reset streaming flag
+        if isinstance(self._query_engine, RetrieverQueryEngine):
+            self._query_engine._response_synthesizer._streaming = is_streaming
+
         tool_output = self._get_tool_output_from_response(
             condensed_question, query_response
         )
@@ -178,8 +192,22 @@ def stream_chat(
         if self._verbose:
             print(log_str)

+        # TODO: right now, query engine uses a class attribute to configure streaming,
+        # we are moving towards separate streaming and non-streaming methods.
+        # In the meantime, use this hack to toggle streaming.
+        from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine
+
+        if isinstance(self._query_engine, RetrieverQueryEngine):
+            is_streaming = self._query_engine._response_synthesizer._streaming
+            self._query_engine._response_synthesizer._streaming = True
+
         # Query with standalone question
         query_response = self._query_engine.query(condensed_question)

+        # NOTE: reset streaming flag
+        if isinstance(self._query_engine, RetrieverQueryEngine):
+            self._query_engine._response_synthesizer._streaming = is_streaming
+
         tool_output = self._get_tool_output_from_response(
             condensed_question, query_response
         )
@@ -213,8 +241,22 @@ async def achat(
         if self._verbose:
             print(log_str)

+        # TODO: right now, query engine uses a class attribute to configure streaming,
+        # we are moving towards separate streaming and non-streaming methods.
+        # In the meantime, use this hack to toggle streaming.
+        from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine
+
+        if isinstance(self._query_engine, RetrieverQueryEngine):
+            is_streaming = self._query_engine._response_synthesizer._streaming
+            self._query_engine._response_synthesizer._streaming = False
+
         # Query with standalone question
         query_response = await self._query_engine.aquery(condensed_question)

+        # NOTE: reset streaming flag
+        if isinstance(self._query_engine, RetrieverQueryEngine):
+            self._query_engine._response_synthesizer._streaming = is_streaming
+
         tool_output = self._get_tool_output_from_response(
             condensed_question, query_response
         )
@@ -240,8 +282,22 @@ async def astream_chat(
         if self._verbose:
             print(log_str)

+        # TODO: right now, query engine uses a class attribute to configure streaming,
+        # we are moving towards separate streaming and non-streaming methods.
+        # In the meantime, use this hack to toggle streaming.
+        from llama_index.query_engine.retriever_query_engine import RetrieverQueryEngine
+
+        if isinstance(self._query_engine, RetrieverQueryEngine):
+            is_streaming = self._query_engine._response_synthesizer._streaming
+            self._query_engine._response_synthesizer._streaming = True
+
         # Query with standalone question
         query_response = await self._query_engine.aquery(condensed_question)

+        # NOTE: reset streaming flag
+        if isinstance(self._query_engine, RetrieverQueryEngine):
+            self._query_engine._response_synthesizer._streaming = is_streaming
+
         tool_output = self._get_tool_output_from_response(
             condensed_question, query_response
         )
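All four endpoints repeat the same save/toggle/restore dance on the synthesizer's private `_streaming` attribute, and as written the flag is not restored if `query()` raises. A generic sketch of the same idea hardened with `try`/`finally`; the context manager below is illustrative, not part of this commit:

```python
from contextlib import contextmanager
from typing import Any, Iterator


@contextmanager
def force_streaming(query_engine: Any, streaming: bool) -> Iterator[None]:
    """Temporarily override the response synthesizer's streaming flag.

    Mirrors the hack in this commit, but restores the previous value
    even if the wrapped query raises.
    """
    synthesizer = getattr(query_engine, "_response_synthesizer", None)
    if synthesizer is None:
        # Not a RetrieverQueryEngine-style engine; nothing to toggle.
        yield
        return
    previous = synthesizer._streaming
    synthesizer._streaming = streaming
    try:
        yield
    finally:
        synthesizer._streaming = previous


# Hypothetical usage inside stream_chat:
#     with force_streaming(self._query_engine, True):
#         query_response = self._query_engine.query(condensed_question)
```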
