Litellm dev 01 27 2025 p3 (#8047)
* docs(reliability.md): add doc on disabling fallbacks per request

* feat(litellm_pre_call_utils.py): support reading request timeout from request headers - new `x-litellm-timeout` param

Allows setting dynamic model timeouts from Vercel's AI SDK

* test(test_proxy_server.py): add simple unit test for reading request timeout

* test(test_fallbacks.py): add e2e test to confirm timeout passed in request headers is correctly read

* feat(main.py): support passing metadata to openai in preview

Resolves #6022 (comment)

* fix(main.py): fix passing openai metadata

* docs(request_headers.md): document new request headers

* build: Merge branch 'main' into litellm_dev_01_27_2025_p3

* test: loosen test
krrishdholakia authored and ishaan-jaff committed Jan 30, 2025
1 parent e0fcc06 commit 0eb23f6
Showing 11 changed files with 196 additions and 8 deletions.
34 changes: 32 additions & 2 deletions docs/my-website/docs/proxy/reliability.md
@@ -1007,7 +1007,34 @@ curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
}'
```

### Disable Fallbacks per key
### Disable Fallbacks (Per Request/Key)


<Tabs>

<TabItem value="request" label="Per Request">

You can disable fallbacks for a specific request by setting `disable_fallbacks: true` in your request body.

```bash
curl -L -X POST 'http://0.0.0.0:4000/v1/chat/completions' \
-H 'Content-Type: application/json' \
-H 'Authorization: Bearer sk-1234' \
-d '{
"messages": [
{
"role": "user",
"content": "List 5 important events in the XIX century"
}
],
"model": "gpt-3.5-turbo",
"disable_fallbacks": true # 👈 DISABLE FALLBACKS
}'
```

</TabItem>

<TabItem value="key" label="Per Key">

You can disable fallbacks per key by setting `disable_fallbacks: true` in your key metadata.

@@ -1020,4 +1047,7 @@ curl -L -X POST 'http://0.0.0.0:4000/key/generate' \
"disable_fallbacks": true
}
}'
```
```

</TabItem>
</Tabs>
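
For readers using the OpenAI Python SDK, the per-request example above can be reproduced by passing `disable_fallbacks` through `extra_body` — a minimal sketch, assuming the same proxy URL and key as the curl example:

```python
# Sketch: disable fallbacks for a single request via the OpenAI Python SDK.
# Assumes a LiteLLM proxy at http://0.0.0.0:4000 and the key sk-1234 from the docs above.
from openai import OpenAI

client = OpenAI(base_url="http://0.0.0.0:4000", api_key="sk-1234")

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "List 5 important events in the XIX century"}],
    extra_body={"disable_fallbacks": True},  # proxy-specific param, sent in the request body
)
print(response.choices[0].message.content)
```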
12 changes: 12 additions & 0 deletions docs/my-website/docs/proxy/request_headers.md
@@ -0,0 +1,12 @@
# Request Headers

Special headers that are supported by LiteLLM.

## LiteLLM Headers

`x-litellm-timeout` Optional[float]: The timeout for the request in seconds.

## Anthropic Headers

`anthropic-version` Optional[str]: The version of the Anthropic API to use.
`anthropic-beta` Optional[str]: The beta version of the Anthropic API to use.
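
To illustrate the `x-litellm-timeout` header in practice, here is a minimal sketch using the OpenAI Python SDK against a LiteLLM proxy (the proxy URL and key are assumptions matching the examples elsewhere in these docs):

```python
# Sketch: cap a proxied request at 10 seconds via the x-litellm-timeout header.
# Assumes a LiteLLM proxy at http://0.0.0.0:4000 with virtual key sk-1234.
from openai import OpenAI

client = OpenAI(
    base_url="http://0.0.0.0:4000",
    api_key="sk-1234",
    default_headers={"x-litellm-timeout": "10"},  # seconds; parsed as a float by the proxy
)

response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello"}],
)
```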
1 change: 1 addition & 0 deletions docs/my-website/sidebars.js
@@ -66,6 +66,7 @@ const sidebars = {
"proxy/user_keys",
"proxy/clientside_auth",
"proxy/response_headers",
"proxy/request_headers",
],
},
{
6 changes: 6 additions & 0 deletions litellm/main.py
@@ -75,6 +75,7 @@
CustomStreamWrapper,
ProviderConfigManager,
Usage,
add_openai_metadata,
async_mock_completion_streaming_obj,
convert_to_model_response_object,
create_pretrained_tokenizer,
@@ -1586,6 +1587,11 @@ def completion( # type: ignore # noqa: PLR0915
if extra_headers is not None:
optional_params["extra_headers"] = extra_headers

if (
litellm.enable_preview_features and metadata is not None
): # [PREVIEW] allow metadata to be passed to OPENAI
optional_params["metadata"] = add_openai_metadata(metadata)

## LOAD CONFIG - if set
config = litellm.OpenAIConfig.get_config()
for k, v in config.items():
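
The preview-gated metadata forwarding above can be exercised from the SDK roughly as follows — a hedged sketch, assuming `enable_preview_features` is turned on and a valid `OPENAI_API_KEY` is set:

```python
# Sketch: with preview features enabled, `metadata` is forwarded to OpenAI
# (after add_openai_metadata strips hidden params).
import litellm

litellm.enable_preview_features = True

response = litellm.completion(
    model="openai/gpt-3.5-turbo",
    messages=[{"role": "user", "content": "Hello world"}],
    metadata={"my-test-key": "my-test-value"},  # sent through as OpenAI's `metadata` param
)
```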
16 changes: 10 additions & 6 deletions litellm/proxy/_new_secret_config.yaml
@@ -12,9 +12,13 @@ model_list:
num_retries: 0
- model_name: anthropic-claude
litellm_params:
model: anthropic.claude-3-sonnet-20240229-v1:0

litellm_settings:
callbacks: ["langsmith"]
default_internal_user_params:
available_teams: ["litellm_dashboard_54a81fa9-9c69-45e8-b256-0c36bf104e5f", "a29a2dc6-1347-4ebc-a428-e6b56bbba611", "test-group-12"]
model: claude-3-5-haiku-20241022
- model_name: groq/*
litellm_params:
model: groq/*
api_key: os.environ/GROQ_API_KEY
mock_response: Hi!
- model_name: deepseek/*
litellm_params:
model: deepseek/*
api_key: os.environ/DEEPSEEK_API_KEY
1 change: 1 addition & 0 deletions litellm/proxy/_types.py
@@ -2194,6 +2194,7 @@ class SpecialHeaders(enum.Enum):
class LitellmDataForBackendLLMCall(TypedDict, total=False):
headers: dict
organization: str
timeout: Optional[float]


class JWTKeyItem(TypedDict, total=False):
30 changes: 30 additions & 0 deletions litellm/proxy/litellm_pre_call_utils.py
@@ -181,6 +181,31 @@ def clean_headers(


class LiteLLMProxyRequestSetup:
@staticmethod
def _get_timeout_from_request(headers: dict) -> Optional[float]:
"""
Workaround for client requests from Vercel's AI SDK.
Allows the user to set a timeout in the request headers.
Example:
```js
const openaiProvider = createOpenAI({
baseURL: liteLLM.baseURL,
apiKey: liteLLM.apiKey,
compatibility: "compatible",
headers: {
"x-litellm-timeout": "90"
},
});
```
"""
timeout_header = headers.get("x-litellm-timeout", None)
if timeout_header is not None:
return float(timeout_header)
return None

@staticmethod
def _get_forwardable_headers(
headers: Union[Headers, dict],
@@ -267,6 +292,11 @@ def add_litellm_data_for_backend_llm_call(
)
if _organization is not None:
data["organization"] = _organization

timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
if timeout is not None:
data["timeout"] = timeout

return data

@staticmethod
18 changes: 18 additions & 0 deletions litellm/utils.py
@@ -6223,3 +6223,21 @@ def get_non_default_completion_params(kwargs: dict) -> dict:
k: v for k, v in kwargs.items() if k not in default_params
} # model-specific params - pass them straight to the model/provider
return non_default_params


def add_openai_metadata(metadata: dict) -> dict:
"""
Add metadata to OpenAI optional parameters, excluding hidden params.
Args:
metadata (dict): Metadata to include in the request
Returns:
dict: Copy of the metadata with hidden params removed
"""
if metadata is None:
return None
# Only include non-hidden parameters
visible_metadata = {k: v for k, v in metadata.items() if k != "hidden_params"}
return visible_metadata.copy()
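
For a quick sense of the helper's behavior, a small illustrative call (sketch; the example metadata values are made up, only the `hidden_params` key comes from the code above):

```python
# Sketch: hidden params are dropped before metadata is forwarded to OpenAI.
from litellm.utils import add_openai_metadata

meta = {"my-test-key": "my-test-value", "hidden_params": {"internal": True}}
print(add_openai_metadata(meta))  # {'my-test-key': 'my-test-value'}
```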
34 changes: 34 additions & 0 deletions tests/local_testing/test_completion.py
@@ -4580,3 +4580,37 @@ def test_provider_specific_header(custom_llm_provider, expected_result):
mock_post.assert_called_once()
print(mock_post.call_args.kwargs["headers"])
assert "anthropic-beta" in mock_post.call_args.kwargs["headers"]


@pytest.mark.parametrize(
"enable_preview_features",
[True, False],
)
def test_completion_openai_metadata(monkeypatch, enable_preview_features):
from openai import OpenAI

client = OpenAI()

litellm.set_verbose = True

monkeypatch.setattr(litellm, "enable_preview_features", enable_preview_features)
with patch.object(
client.chat.completions.with_raw_response, "create", return_value=MagicMock()
) as mock_completion:
try:
resp = litellm.completion(
model="openai/gpt-3.5-turbo",
messages=[{"role": "user", "content": "Hello world"}],
metadata={"my-test-key": "my-test-value"},
client=client,
)
except Exception as e:
print(f"Error: {e}")

mock_completion.assert_called_once()
if enable_preview_features:
assert mock_completion.call_args.kwargs["metadata"] == {
"my-test-key": "my-test-value"
}
else:
assert "metadata" not in mock_completion.call_args.kwargs
16 changes: 16 additions & 0 deletions tests/proxy_unit_tests/test_proxy_server.py
@@ -2190,3 +2190,19 @@ async def test_get_ui_settings_spend_logs_threshold():

# Clean up
proxy_state.set_proxy_state_variable("spend_logs_row_count", 0)


def test_get_timeout_from_request():
from litellm.proxy.litellm_pre_call_utils import LiteLLMProxyRequestSetup

headers = {
"x-litellm-timeout": "90",
}
timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
assert timeout == 90

headers = {
"x-litellm-timeout": "90.5",
}
timeout = LiteLLMProxyRequestSetup._get_timeout_from_request(headers)
assert timeout == 90.5
36 changes: 36 additions & 0 deletions tests/test_fallbacks.py
@@ -5,6 +5,7 @@
import aiohttp
from large_text import text
import time
from typing import Optional


async def generate_key(
@@ -44,13 +45,16 @@ async def chat_completion(
model: str,
messages: list,
return_headers: bool = False,
extra_headers: Optional[dict] = None,
**kwargs,
):
url = "http://0.0.0.0:4000/chat/completions"
headers = {
"Authorization": f"Bearer {key}",
"Content-Type": "application/json",
}
if extra_headers is not None:
headers.update(extra_headers)
data = {"model": model, "messages": messages, **kwargs}

async with session.post(url, headers=headers, json=data) as response:
@@ -180,6 +184,38 @@ async def test_chat_completion_with_timeout():
) # assert model-specific timeout used


@pytest.mark.asyncio
async def test_chat_completion_with_timeout_from_request():
"""
Make a chat completion call with a low timeout set via the `x-litellm-timeout` request header and `mock_timeout: true`. Expect the timeout from the header to be reflected in the response headers.
"""
async with aiohttp.ClientSession() as session:
model = "fake-openai-endpoint-5"
messages = [
{"role": "system", "content": text},
{"role": "user", "content": "Who was Alexander?"},
]
extra_headers = {
"x-litellm-timeout": "0.001",
}
start_time = time.time()
response, headers = await chat_completion(
session=session,
key="sk-1234",
model=model,
messages=messages,
num_retries=0,
mock_timeout=True,
extra_headers=extra_headers,
return_headers=True,
)
end_time = time.time()
print(f"headers: {headers}")
assert (
headers["x-litellm-timeout"] == "0.001"
) # assert timeout from request header used


@pytest.mark.parametrize("has_access", [True, False])
@pytest.mark.asyncio
async def test_chat_completion_client_fallbacks_with_custom_message(has_access):