use max_tokens to do better rate limit handling #4224

Merged: 3 commits, Mar 7, 2025
Changes from 1 commit
@@ -31,6 +31,7 @@
get_langgraph_node_log_string,
)
from onyx.agents.agent_search.shared_graph_utils.utils import parse_question_id
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_VALIDATION
from onyx.configs.agent_configs import AGENT_TIMEOUT_CONNECT_LLM_SUBANSWER_CHECK
from onyx.configs.agent_configs import AGENT_TIMEOUT_LLM_SUBANSWER_CHECK
from onyx.llm.chat_llm import LLMRateLimitError
@@ -92,6 +93,7 @@ def check_sub_answer(
fast_llm.invoke,
prompt=msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_SUBANSWER_CHECK,
max_tokens=AGENT_MAX_TOKENS_VALIDATION,
)

quality_str: str = cast(str, response.content)

@@ -46,6 +46,7 @@
from onyx.chat.models import StreamStopReason
from onyx.chat.models import StreamType
from onyx.configs.agent_configs import AGENT_MAX_ANSWER_CONTEXT_DOCS
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_SUBANSWER_GENERATION
from onyx.configs.agent_configs import AGENT_TIMEOUT_CONNECT_LLM_SUBANSWER_GENERATION
from onyx.configs.agent_configs import AGENT_TIMEOUT_LLM_SUBANSWER_GENERATION
from onyx.llm.chat_llm import LLMRateLimitError
@@ -119,6 +120,7 @@ def stream_sub_answer() -> list[str]:
for message in fast_llm.stream(
prompt=msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_SUBANSWER_GENERATION,
max_tokens=AGENT_MAX_TOKENS_SUBANSWER_GENERATION,
):
# TODO: in principle, the answer here COULD contain images, but we don't support that yet
content = message.content

@@ -43,6 +43,7 @@
from onyx.agents.agent_search.shared_graph_utils.operators import (
dedup_inference_section_list,
)
from onyx.agents.agent_search.shared_graph_utils.utils import _should_restrict_tokens
from onyx.agents.agent_search.shared_graph_utils.utils import (
dispatch_main_answer_stop_info,
)
@@ -62,6 +63,7 @@
from onyx.configs.agent_configs import AGENT_ANSWER_GENERATION_BY_FAST_LLM
from onyx.configs.agent_configs import AGENT_MAX_ANSWER_CONTEXT_DOCS
from onyx.configs.agent_configs import AGENT_MAX_STREAMED_DOCS_FOR_INITIAL_ANSWER
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_ANSWER_GENERATION
from onyx.configs.agent_configs import AGENT_MIN_ORIG_QUESTION_DOCS
from onyx.configs.agent_configs import (
AGENT_TIMEOUT_CONNECT_LLM_INITIAL_ANSWER_GENERATION,
@@ -278,6 +280,9 @@ def stream_initial_answer() -> list[str]:
for message in model.stream(
msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_INITIAL_ANSWER_GENERATION,
max_tokens=AGENT_MAX_TOKENS_ANSWER_GENERATION
if _should_restrict_tokens(model.config)
else None,
):
# TODO: in principle, the answer here COULD contain images, but we don't support that yet
content = message.content

@@ -34,6 +34,7 @@
from onyx.chat.models import StreamStopReason
from onyx.chat.models import StreamType
from onyx.chat.models import SubQuestionPiece
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_SUBQUESTION_GENERATION
from onyx.configs.agent_configs import AGENT_NUM_DOCS_FOR_DECOMPOSITION
from onyx.configs.agent_configs import (
AGENT_TIMEOUT_CONNECT_LLM_SUBQUESTION_GENERATION,
@@ -141,6 +142,7 @@ def decompose_orig_question(
model.stream(
msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_SUBQUESTION_GENERATION,
max_tokens=AGENT_MAX_TOKENS_SUBQUESTION_GENERATION,
),
dispatch_subquestion(0, writer),
sep_callback=dispatch_subquestion_sep(0, writer),

@@ -33,6 +33,7 @@
)
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import RefinedAnswerImprovement
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_VALIDATION
from onyx.configs.agent_configs import AGENT_TIMEOUT_CONNECT_LLM_COMPARE_ANSWERS
from onyx.configs.agent_configs import AGENT_TIMEOUT_LLM_COMPARE_ANSWERS
from onyx.llm.chat_llm import LLMRateLimitError
@@ -112,6 +113,7 @@ def compare_answers(
model.invoke,
prompt=msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_COMPARE_ANSWERS,
max_tokens=AGENT_MAX_TOKENS_VALIDATION,
)

except (LLMTimeoutError, TimeoutError):

@@ -43,6 +43,7 @@
from onyx.agents.agent_search.shared_graph_utils.utils import make_question_id
from onyx.agents.agent_search.shared_graph_utils.utils import write_custom_event
from onyx.chat.models import StreamingError
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_SUBQUESTION_GENERATION
from onyx.configs.agent_configs import (
AGENT_TIMEOUT_CONNECT_LLM_REFINED_SUBQUESTION_GENERATION,
)
@@ -144,6 +145,7 @@ def create_refined_sub_questions(
model.stream(
msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_REFINED_SUBQUESTION_GENERATION,
max_tokens=AGENT_MAX_TOKENS_SUBQUESTION_GENERATION,
),
dispatch_subquestion(1, writer),
sep_callback=dispatch_subquestion_sep(1, writer),

@@ -50,13 +50,7 @@ def decide_refinement_need(
)
]

-if graph_config.behavior.allow_refinement:
-return RequireRefinemenEvalUpdate(
-require_refined_answer_eval=decision,
-log_messages=log_messages,
-)
-else:
-return RequireRefinemenEvalUpdate(
-require_refined_answer_eval=False,
-log_messages=log_messages,
-)
+return RequireRefinemenEvalUpdate(
+require_refined_answer_eval=graph_config.behavior.allow_refinement and decision,
+log_messages=log_messages,
+)
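
The refactor above collapses the if/else into a single boolean expression. A quick stand-alone check (illustrative only, not onyx code) that the two forms agree for every combination of inputs:

```python
# Old form: return `decision` when refinement is allowed, otherwise False.
# New form: `allow_refinement and decision`.
for allow_refinement in (True, False):
    for decision in (True, False):
        old = decision if allow_refinement else False
        new = allow_refinement and decision
        assert old == new  # identical for all four combinations
```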

@@ -21,6 +21,7 @@
from onyx.agents.agent_search.shared_graph_utils.utils import (
get_langgraph_node_log_string,
)
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_ENTITY_TERM_EXTRACTION
from onyx.configs.agent_configs import (
AGENT_TIMEOUT_CONNECT_LLM_ENTITY_TERM_EXTRACTION,
)
@@ -96,6 +97,7 @@ def extract_entities_terms(
fast_llm.invoke,
prompt=msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_ENTITY_TERM_EXTRACTION,
max_tokens=AGENT_MAX_TOKENS_ENTITY_TERM_EXTRACTION,
)

cleaned_response = (

@@ -46,6 +46,7 @@
from onyx.agents.agent_search.shared_graph_utils.operators import (
dedup_inference_section_list,
)
from onyx.agents.agent_search.shared_graph_utils.utils import _should_restrict_tokens
from onyx.agents.agent_search.shared_graph_utils.utils import (
dispatch_main_answer_stop_info,
)
@@ -68,6 +69,8 @@
from onyx.configs.agent_configs import AGENT_ANSWER_GENERATION_BY_FAST_LLM
from onyx.configs.agent_configs import AGENT_MAX_ANSWER_CONTEXT_DOCS
from onyx.configs.agent_configs import AGENT_MAX_STREAMED_DOCS_FOR_REFINED_ANSWER
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_ANSWER_GENERATION
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_VALIDATION
from onyx.configs.agent_configs import AGENT_MIN_ORIG_QUESTION_DOCS
from onyx.configs.agent_configs import (
AGENT_TIMEOUT_CONNECT_LLM_REFINED_ANSWER_GENERATION,
@@ -302,7 +305,11 @@ def generate_validate_refined_answer(

def stream_refined_answer() -> list[str]:
for message in model.stream(
msg, timeout_override=AGENT_TIMEOUT_CONNECT_LLM_REFINED_ANSWER_GENERATION
msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_REFINED_ANSWER_GENERATION,
max_tokens=AGENT_MAX_TOKENS_ANSWER_GENERATION
if _should_restrict_tokens(model.config)
else None,
):
# TODO: in principle, the answer here COULD contain images, but we don't support that yet
content = message.content
@@ -409,6 +416,7 @@ def stream_refined_answer() -> list[str]:
validation_model.invoke,
prompt=msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_REFINED_ANSWER_VALIDATION,
max_tokens=AGENT_MAX_TOKENS_VALIDATION,
)
refined_answer_quality = binary_string_test_after_answer_separator(
text=cast(str, validation_response.content),

@@ -33,6 +33,7 @@
get_langgraph_node_log_string,
)
from onyx.agents.agent_search.shared_graph_utils.utils import parse_question_id
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_SUBQUERY_GENERATION
from onyx.configs.agent_configs import (
AGENT_TIMEOUT_CONNECT_LLM_QUERY_REWRITING_GENERATION,
)
@@ -96,6 +97,7 @@ def expand_queries(
model.stream(
prompt=msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_QUERY_REWRITING_GENERATION,
max_tokens=AGENT_MAX_TOKENS_SUBQUERY_GENERATION,
),
dispatch_subquery(level, question_num, writer),
)

@@ -25,6 +25,7 @@
from onyx.agents.agent_search.shared_graph_utils.utils import (
get_langgraph_node_log_string,
)
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_VALIDATION
from onyx.configs.agent_configs import AGENT_TIMEOUT_CONNECT_LLM_DOCUMENT_VERIFICATION
from onyx.configs.agent_configs import AGENT_TIMEOUT_LLM_DOCUMENT_VERIFICATION
from onyx.llm.chat_llm import LLMRateLimitError
@@ -93,6 +94,7 @@ def verify_documents(
fast_llm.invoke,
prompt=msg,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_DOCUMENT_VERIFICATION,
max_tokens=AGENT_MAX_TOKENS_VALIDATION,
)

assert isinstance(response.content, str)

@@ -42,6 +42,7 @@
from onyx.chat.models import StreamStopReason
from onyx.chat.models import StreamType
from onyx.chat.prompt_builder.answer_prompt_builder import AnswerPromptBuilder
from onyx.configs.agent_configs import AGENT_MAX_TOKENS_HISTORY_SUMMARY
from onyx.configs.agent_configs import (
AGENT_TIMEOUT_CONNECT_LLM_HISTORY_SUMMARY_GENERATION,
)
@@ -61,6 +62,7 @@
from onyx.llm.chat_llm import LLMRateLimitError
from onyx.llm.chat_llm import LLMTimeoutError
from onyx.llm.interfaces import LLM
from onyx.llm.interfaces import LLMConfig
from onyx.prompts.agent_search import (
ASSISTANT_SYSTEM_PROMPT_DEFAULT,
)
@@ -402,6 +404,7 @@ def summarize_history(
llm.invoke,
history_context_prompt,
timeout_override=AGENT_TIMEOUT_CONNECT_LLM_HISTORY_SUMMARY_GENERATION,
max_tokens=AGENT_MAX_TOKENS_HISTORY_SUMMARY,
)
except (LLMTimeoutError, TimeoutError):
logger.error("LLM Timeout Error - summarize history")
@@ -505,3 +508,9 @@ def get_deduplicated_structured_subquestion_documents(
cited_documents=dedup_inference_section_list(cited_docs),
context_documents=dedup_inference_section_list(context_docs),
)


def _should_restrict_tokens(llm_config: LLMConfig) -> bool:
return not (
llm_config.model_provider == "openai" and llm_config.model_name.startswith("o")
)
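
The new _should_restrict_tokens helper above skips the cap for OpenAI "o"-series models. The PR doesn't state the rationale; a plausible reading (an assumption, not something in the diff) is that those reasoning models spend part of the output budget on hidden reasoning tokens, so a small max_tokens could truncate the visible answer. A self-contained sketch of the rule, using a stand-in config class rather than onyx's LLMConfig:

```python
from dataclasses import dataclass


@dataclass
class FakeLLMConfig:  # stand-in for onyx.llm.interfaces.LLMConfig
    model_provider: str
    model_name: str


def should_restrict_tokens(cfg: FakeLLMConfig) -> bool:
    # Same rule as the helper above: cap everything except OpenAI "o"-series models.
    return not (cfg.model_provider == "openai" and cfg.model_name.startswith("o"))


assert should_restrict_tokens(FakeLLMConfig("openai", "gpt-4o")) is True    # capped
assert should_restrict_tokens(FakeLLMConfig("openai", "o3-mini")) is False  # reasoning model, uncapped
assert should_restrict_tokens(FakeLLMConfig("anthropic", "claude-3-5-sonnet")) is True  # capped
```
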
51 changes: 46 additions & 5 deletions backend/onyx/configs/agent_configs.py
@@ -217,20 +217,20 @@
)


-AGENT_DEFAULT_TIMEOUT_CONNECT_LLM_SUBANSWER_GENERATION = 4 # in seconds
+AGENT_DEFAULT_TIMEOUT_CONNECT_LLM_SUBANSWER_GENERATION = 6 # in seconds
AGENT_TIMEOUT_CONNECT_LLM_SUBANSWER_GENERATION = int(
os.environ.get("AGENT_TIMEOUT_CONNECT_LLM_SUBANSWER_GENERATION")
or AGENT_DEFAULT_TIMEOUT_CONNECT_LLM_SUBANSWER_GENERATION
)

-AGENT_DEFAULT_TIMEOUT_LLM_SUBANSWER_GENERATION = 30 # in seconds
+AGENT_DEFAULT_TIMEOUT_LLM_SUBANSWER_GENERATION = 40 # in seconds
AGENT_TIMEOUT_LLM_SUBANSWER_GENERATION = int(
os.environ.get("AGENT_TIMEOUT_LLM_SUBANSWER_GENERATION")
or AGENT_DEFAULT_TIMEOUT_LLM_SUBANSWER_GENERATION
)


-AGENT_DEFAULT_TIMEOUT_CONNECT_LLM_INITIAL_ANSWER_GENERATION = 5 # in seconds
+AGENT_DEFAULT_TIMEOUT_CONNECT_LLM_INITIAL_ANSWER_GENERATION = 10 # in seconds
AGENT_TIMEOUT_CONNECT_LLM_INITIAL_ANSWER_GENERATION = int(
os.environ.get("AGENT_TIMEOUT_CONNECT_LLM_INITIAL_ANSWER_GENERATION")
or AGENT_DEFAULT_TIMEOUT_CONNECT_LLM_INITIAL_ANSWER_GENERATION
@@ -243,13 +243,13 @@
)


-AGENT_DEFAULT_TIMEOUT_CONNECT_LLM_REFINED_ANSWER_GENERATION = 5 # in seconds
+AGENT_DEFAULT_TIMEOUT_CONNECT_LLM_REFINED_ANSWER_GENERATION = 15 # in seconds
AGENT_TIMEOUT_CONNECT_LLM_REFINED_ANSWER_GENERATION = int(
os.environ.get("AGENT_TIMEOUT_CONNECT_LLM_REFINED_ANSWER_GENERATION")
or AGENT_DEFAULT_TIMEOUT_CONNECT_LLM_REFINED_ANSWER_GENERATION
)

-AGENT_DEFAULT_TIMEOUT_LLM_REFINED_ANSWER_GENERATION = 30 # in seconds
+AGENT_DEFAULT_TIMEOUT_LLM_REFINED_ANSWER_GENERATION = 45 # in seconds
AGENT_TIMEOUT_LLM_REFINED_ANSWER_GENERATION = int(
os.environ.get("AGENT_TIMEOUT_LLM_REFINED_ANSWER_GENERATION")
or AGENT_DEFAULT_TIMEOUT_LLM_REFINED_ANSWER_GENERATION
@@ -333,4 +333,45 @@
or AGENT_DEFAULT_TIMEOUT_LLM_REFINED_ANSWER_VALIDATION
)

AGENT_DEFAULT_MAX_TOKENS_VALIDATION = 4
AGENT_MAX_TOKENS_VALIDATION = int(
os.environ.get("AGENT_MAX_TOKENS_VALIDATION") or AGENT_DEFAULT_MAX_TOKENS_VALIDATION
)

AGENT_DEFAULT_MAX_TOKENS_SUBANSWER_GENERATION = 256
AGENT_MAX_TOKENS_SUBANSWER_GENERATION = int(
os.environ.get("AGENT_MAX_TOKENS_SUBANSWER_GENERATION")
or AGENT_DEFAULT_MAX_TOKENS_SUBANSWER_GENERATION
)

AGENT_DEFAULT_MAX_TOKENS_ANSWER_GENERATION = 1024
AGENT_MAX_TOKENS_ANSWER_GENERATION = int(
os.environ.get("AGENT_MAX_TOKENS_INITIAL_ANSWER_GENERATION")
Review comment (Contributor):

logic: Environment variable name AGENT_MAX_TOKENS_INITIAL_ANSWER_GENERATION doesn't match the config variable name AGENT_MAX_TOKENS_ANSWER_GENERATION.

Suggested change:
-os.environ.get("AGENT_MAX_TOKENS_INITIAL_ANSWER_GENERATION")
+os.environ.get("AGENT_MAX_TOKENS_ANSWER_GENERATION")

or AGENT_DEFAULT_MAX_TOKENS_ANSWER_GENERATION
)

AGENT_DEFAULT_MAX_TOKENS_SUBQUESTION_GENERATION = 256
AGENT_MAX_TOKENS_SUBQUESTION_GENERATION = int(
os.environ.get("AGENT_MAX_TOKENS_SUBQUESTION_GENERATION")
or AGENT_DEFAULT_MAX_TOKENS_SUBQUESTION_GENERATION
)

AGENT_DEFAULT_MAX_TOKENS_ENTITY_TERM_EXTRACTION = 1024
AGENT_MAX_TOKENS_ENTITY_TERM_EXTRACTION = int(
os.environ.get("AGENT_MAX_TOKENS_ENTITY_TERM_EXTRACTION")
or AGENT_DEFAULT_MAX_TOKENS_ENTITY_TERM_EXTRACTION
)

AGENT_DEFAULT_MAX_TOKENS_SUBQUERY_GENERATION = 64
AGENT_MAX_TOKENS_SUBQUERY_GENERATION = int(
os.environ.get("AGENT_MAX_TOKENS_SUBQUERY_GENERATION")
or AGENT_DEFAULT_MAX_TOKENS_SUBQUERY_GENERATION
)

AGENT_DEFAULT_MAX_TOKENS_HISTORY_SUMMARY = 128
AGENT_MAX_TOKENS_HISTORY_SUMMARY = int(
os.environ.get("AGENT_MAX_TOKENS_HISTORY_SUMMARY")
or AGENT_DEFAULT_MAX_TOKENS_HISTORY_SUMMARY
)

GRAPH_VERSION_NAME: str = "a"
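
All of the new caps above follow the same int(os.environ.get(NAME) or DEFAULT) idiom: an unset or empty variable falls back to the default, and any set value wins. That is also why the reviewer's point above matters: with the mismatched name, exporting AGENT_MAX_TOKENS_ANSWER_GENERATION is silently ignored. A small sketch of both behaviors (values are illustrative):

```python
import os

# The fallback idiom: unset or empty -> default, set -> override.
os.environ.pop("AGENT_MAX_TOKENS_SUBANSWER_GENERATION", None)
assert int(os.environ.get("AGENT_MAX_TOKENS_SUBANSWER_GENERATION") or 256) == 256
os.environ["AGENT_MAX_TOKENS_SUBANSWER_GENERATION"] = "512"
assert int(os.environ.get("AGENT_MAX_TOKENS_SUBANSWER_GENERATION") or 256) == 512

# The mismatch flagged in the review comment: setting the expected variable name
# has no effect, because the code reads AGENT_MAX_TOKENS_INITIAL_ANSWER_GENERATION.
os.environ["AGENT_MAX_TOKENS_ANSWER_GENERATION"] = "2048"
assert int(os.environ.get("AGENT_MAX_TOKENS_INITIAL_ANSWER_GENERATION") or 1024) == 1024
```
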
8 changes: 7 additions & 1 deletion backend/onyx/llm/chat_llm.py
@@ -167,7 +167,7 @@ def _convert_delta_to_message_chunk(
stop_reason: str | None = None,
) -> BaseMessageChunk:
"""Adapted from langchain_community.chat_models.litellm._convert_delta_to_message_chunk"""
-role = _dict.get("role") or (_base_msg_to_role(curr_msg) if curr_msg else None)
+role = _dict.get("role") or (_base_msg_to_role(curr_msg) if curr_msg else "unknown")
content = _dict.get("content") or ""
additional_kwargs = {}
if _dict.get("function_call"):
@@ -402,6 +402,7 @@ def _completion(
stream: bool,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> litellm.ModelResponse | litellm.CustomStreamWrapper:
# litellm doesn't accept LangChain BaseMessage objects, so we need to convert them
# to a dict representation
@@ -429,6 +430,7 @@
# model params
temperature=0,
timeout=timeout_override or self._timeout,
max_tokens=max_tokens,
# For now, we don't support parallel tool calls
# NOTE: we can't pass this in if tools are not specified
# or else OpenAI throws an error
@@ -484,6 +486,7 @@ def _invoke_implementation(
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> BaseMessage:
if LOG_DANSWER_MODEL_INTERACTIONS:
self.log_model_configs()
@@ -497,6 +500,7 @@
stream=False,
structured_response_format=structured_response_format,
timeout_override=timeout_override,
max_tokens=max_tokens,
),
)
choice = response.choices[0]
@@ -515,6 +519,7 @@ def _stream_implementation(
tool_choice: ToolChoiceOptions | None = None,
structured_response_format: dict | None = None,
timeout_override: int | None = None,
max_tokens: int | None = None,
) -> Iterator[BaseMessage]:
if LOG_DANSWER_MODEL_INTERACTIONS:
self.log_model_configs()
@@ -539,6 +544,7 @@
stream=True,
structured_response_format=structured_response_format,
timeout_override=timeout_override,
max_tokens=max_tokens,
),
)
try:
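
Taken together, the chat_llm.py changes make the cap a plain optional pass-through: each public entry point accepts max_tokens, forwards it to _completion, and _completion hands it to litellm.completion, where a None value presumably leaves the provider's default output budget untouched. A self-contained sketch of that shape (stand-in names, not onyx code; the real timeout wrapper, tools, and error handling are omitted):

```python
from typing import Any


def provider_completion(**kwargs: Any) -> dict:
    # Stand-in for litellm.completion; it just records what it was asked to do.
    return {"max_tokens": kwargs.get("max_tokens"), "timeout": kwargs.get("timeout")}


class FakeLLM:
    def __init__(self, default_timeout: int = 30) -> None:
        self._timeout = default_timeout

    def _completion(self, *, timeout_override: int | None, max_tokens: int | None) -> dict:
        return provider_completion(
            temperature=0,
            timeout=timeout_override or self._timeout,
            max_tokens=max_tokens,  # None -> no explicit cap
        )

    def invoke(self, *, timeout_override: int | None = None, max_tokens: int | None = None) -> dict:
        return self._completion(timeout_override=timeout_override, max_tokens=max_tokens)


llm = FakeLLM()
assert llm.invoke(timeout_override=8, max_tokens=4)["max_tokens"] == 4  # capped validation-style call
assert llm.invoke()["max_tokens"] is None  # default remains uncapped
```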