mirror of
https://github.com/onyx-dot-app/litellm.git
synced 2026-07-01 20:44:04 -04:00
This commit is contained in:
committed by
GitHub
parent
21c3720732
commit
6842d705d5
@@ -477,9 +477,9 @@ except ImportError:
|
||||
server_root_path = os.getenv("SERVER_ROOT_PATH", "")
|
||||
_license_check = LicenseCheck()
|
||||
premium_user: bool = _license_check.is_premium()
|
||||
premium_user_data: Optional["EnterpriseLicenseData"] = (
|
||||
_license_check.airgapped_license_data
|
||||
)
|
||||
premium_user_data: Optional[
|
||||
"EnterpriseLicenseData"
|
||||
] = _license_check.airgapped_license_data
|
||||
global_max_parallel_request_retries_env: Optional[str] = os.getenv(
|
||||
"LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES"
|
||||
)
|
||||
@@ -1006,9 +1006,9 @@ worker_config = None
|
||||
master_key: Optional[str] = None
|
||||
otel_logging = False
|
||||
prisma_client: Optional[PrismaClient] = None
|
||||
shared_aiohttp_session: Optional["ClientSession"] = (
|
||||
None # Global shared session for connection reuse
|
||||
)
|
||||
shared_aiohttp_session: Optional[
|
||||
"ClientSession"
|
||||
] = None # Global shared session for connection reuse
|
||||
user_api_key_cache = DualCache(
|
||||
default_in_memory_ttl=UserAPIKeyCacheTTLEnum.in_memory_cache_ttl.value
|
||||
)
|
||||
@@ -1016,9 +1016,9 @@ model_max_budget_limiter = _PROXY_VirtualKeyModelMaxBudgetLimiter(
|
||||
dual_cache=user_api_key_cache
|
||||
)
|
||||
litellm.logging_callback_manager.add_litellm_callback(model_max_budget_limiter)
|
||||
redis_usage_cache: Optional[RedisCache] = (
|
||||
None # redis cache used for tracking spend, tpm/rpm limits
|
||||
)
|
||||
redis_usage_cache: Optional[
|
||||
RedisCache
|
||||
] = None # redis cache used for tracking spend, tpm/rpm limits
|
||||
user_custom_auth = None
|
||||
user_custom_key_generate = None
|
||||
user_custom_sso = None
|
||||
@@ -1351,9 +1351,9 @@ async def update_cache( # noqa: PLR0915
|
||||
_id = "team_id:{}".format(team_id)
|
||||
try:
|
||||
# Fetch the existing cost for the given user
|
||||
existing_spend_obj: Optional[LiteLLM_TeamTable] = (
|
||||
await user_api_key_cache.async_get_cache(key=_id)
|
||||
)
|
||||
existing_spend_obj: Optional[
|
||||
LiteLLM_TeamTable
|
||||
] = await user_api_key_cache.async_get_cache(key=_id)
|
||||
if existing_spend_obj is None:
|
||||
# do nothing if team not in api key cache
|
||||
return
|
||||
@@ -1518,10 +1518,11 @@ async def _run_background_health_check():
|
||||
|
||||
if shared_health_manager is not None:
|
||||
try:
|
||||
healthy_endpoints, unhealthy_endpoints = (
|
||||
await shared_health_manager.perform_shared_health_check(
|
||||
model_list=_llm_model_list, details=details_bool
|
||||
)
|
||||
(
|
||||
healthy_endpoints,
|
||||
unhealthy_endpoints,
|
||||
) = await shared_health_manager.perform_shared_health_check(
|
||||
model_list=_llm_model_list, details=details_bool
|
||||
)
|
||||
except Exception as e:
|
||||
verbose_proxy_logger.error(
|
||||
@@ -3370,10 +3371,10 @@ class ProxyConfig:
|
||||
)
|
||||
|
||||
try:
|
||||
guardrails_in_db: List[Guardrail] = (
|
||||
await GuardrailRegistry.get_all_guardrails_from_db(
|
||||
prisma_client=prisma_client
|
||||
)
|
||||
guardrails_in_db: List[
|
||||
Guardrail
|
||||
] = await GuardrailRegistry.get_all_guardrails_from_db(
|
||||
prisma_client=prisma_client
|
||||
)
|
||||
verbose_proxy_logger.debug(
|
||||
"guardrails from the DB %s", str(guardrails_in_db)
|
||||
@@ -3603,9 +3604,9 @@ async def initialize( # noqa: PLR0915
|
||||
user_api_base = api_base
|
||||
dynamic_config[user_model]["api_base"] = api_base
|
||||
if api_version:
|
||||
os.environ["AZURE_API_VERSION"] = (
|
||||
api_version # set this for azure - litellm can read this from the env
|
||||
)
|
||||
os.environ[
|
||||
"AZURE_API_VERSION"
|
||||
] = api_version # set this for azure - litellm can read this from the env
|
||||
if max_tokens: # model-specific param
|
||||
dynamic_config[user_model]["max_tokens"] = max_tokens
|
||||
if temperature: # model-specific param
|
||||
@@ -6241,6 +6242,7 @@ async def token_counter(request: TokenCountRequest, call_endpoint: bool = False)
|
||||
pass
|
||||
if deployment is not None:
|
||||
litellm_model_name = deployment.get("litellm_params", {}).get("model")
|
||||
model_info = deployment.get("model_info", {})
|
||||
load_credentials_from_list(deployment.get("litellm_params", {}))
|
||||
# remove the custom_llm_provider_prefix in the litellm_model_name
|
||||
if "/" in litellm_model_name:
|
||||
@@ -8968,9 +8970,9 @@ async def get_config_list(
|
||||
hasattr(sub_field_info, "description")
|
||||
and sub_field_info.description is not None
|
||||
):
|
||||
nested_fields[idx].field_description = (
|
||||
sub_field_info.description
|
||||
)
|
||||
nested_fields[
|
||||
idx
|
||||
].field_description = sub_field_info.description
|
||||
idx += 1
|
||||
|
||||
_stored_in_db = None
|
||||
|
||||
@@ -0,0 +1,219 @@
|
||||
"""
|
||||
Test for custom_tokenizer bug fix.
|
||||
Issue: custom_tokenizer from model_info was not being extracted from deployment,
|
||||
causing token_counter to always use OpenAI tokenizer instead of the configured custom tokenizer.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
import litellm
|
||||
import litellm.proxy.proxy_server
|
||||
from litellm.proxy.proxy_server import token_counter
|
||||
from litellm.proxy._types import TokenCountRequest
|
||||
from litellm import Router
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_custom_tokenizer_from_model_info():
    """
    Test that custom_tokenizer from model_info is correctly used for token counting.

    Real-world scenario: Using intfloat/multilingual-e5-large-instruct tokenizer
    for a custom embedding model (like Groq-hosted llama model used for embeddings).

    This test reproduces the bug where:
    - model_info was declared but never populated from deployment
    - custom_tokenizer was therefore never extracted
    - token_counter always fell back to OpenAI tokenizer

    Expected behavior:
    - When a model has custom_tokenizer in model_info
    - The token_counter should use that custom tokenizer (intfloat/multilingual-e5-large-instruct)
    - tokenizer_type should reflect "huggingface_tokenizer" not "openai_tokenizer"
    """

    # Create a router with a model that has custom_tokenizer for multilingual embeddings
    # This matches the user's real config with intfloat/multilingual-e5-large-instruct
    llm_router = Router(
        model_list=[
            {
                "model_name": "nikro-llama",
                "litellm_params": {
                    "model": "openai/llama-3.1-8b-instant",
                    "api_base": "https://api.groq.com/openai/v1",
                },
                "model_info": {
                    "mode": "embedding",
                    "custom_tokenizer": {
                        "identifier": "intfloat/multilingual-e5-large-instruct",
                        "revision": "main",
                        "auth_token": None,
                    },
                },
            }
        ]
    )

    # The router is installed as a module-level attribute of proxy_server.
    # Remember the previous value and put it back afterwards so this test does
    # not leak its router into other tests running in the same session.
    previous_router = getattr(litellm.proxy.proxy_server, "llm_router", None)
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
    try:
        # Make a token counting request with a multilingual text sample
        # This is realistic for the multilingual-e5 model
        response = await token_counter(
            request=TokenCountRequest(
                model="nikro-llama",
                messages=[
                    {"role": "user", "content": "Hello world! Bonjour le monde! 你好世界!"}
                ],
            )
        )
    finally:
        setattr(litellm.proxy.proxy_server, "llm_router", previous_router)

    print("Response:", response)
    print("Tokenizer type:", response.tokenizer_type)
    print("Model used:", response.model_used)
    print("Total tokens:", response.total_tokens)

    # Verify that custom tokenizer (intfloat/multilingual-e5-large-instruct) was used
    assert response.tokenizer_type == "huggingface_tokenizer", (
        "Expected 'huggingface_tokenizer' (intfloat/multilingual-e5-large-instruct) "
        f"but got '{response.tokenizer_type}'. "
        "This indicates the custom_tokenizer from model_info was not used."
    )
    assert response.request_model == "nikro-llama"
    # The "openai/" provider prefix is expected to be stripped from the model name.
    assert response.model_used == "llama-3.1-8b-instant"
    assert response.total_tokens > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_custom_tokenizer_with_llamacpp():
    """
    Test custom_tokenizer with llamacpp model (similar to user's setup).

    This simulates the user's Docker environment where:
    - They have a llamacpp model
    - With custom_tokenizer configured
    - In Docker, it was using OpenAI tokenizer (bug)
    - Locally, it was using HuggingFace tokenizer (correct)
    """

    llm_router = Router(
        model_list=[
            {
                "model_name": "my-local-model",
                "litellm_params": {
                    "model": "openai/my-local-llama",
                    "api_base": "http://localhost:8080/v1",
                },
                "model_info": {
                    "custom_tokenizer": {
                        "identifier": "intfloat/multilingual-e5-large-instruct",
                        "revision": "main",
                        "auth_token": None,
                    },
                },
            }
        ]
    )

    # Swap the module-level router in for the duration of the call only, then
    # restore whatever was there so later tests are not affected by this one.
    previous_router = getattr(litellm.proxy.proxy_server, "llm_router", None)
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
    try:
        response = await token_counter(
            request=TokenCountRequest(
                model="my-local-model",
                messages=[{"role": "user", "content": "test message"}],
            )
        )
    finally:
        setattr(litellm.proxy.proxy_server, "llm_router", previous_router)

    # The bug would cause this to be "openai_tokenizer"
    assert (
        response.tokenizer_type == "huggingface_tokenizer"
    ), f"Custom tokenizer not used! Got: {response.tokenizer_type}"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_multilingual_e5_embedding_model():
    """
    Test the exact real-world use case: intfloat/multilingual-e5-large-instruct
    tokenizer with a custom embedding endpoint.

    This is the user's actual production scenario:
    - Custom embedding model endpoint (could be llama.cpp, vLLM, etc.)
    - Using intfloat/multilingual-e5-large-instruct for tokenization
    - Model served via OpenAI-compatible API
    """

    llm_router = Router(
        model_list=[
            {
                "model_name": "my-embedding-model",
                "litellm_params": {
                    "model": "openai/custom-embedding-model",
                    "api_base": "http://localhost:8080/v1",
                },
                "model_info": {
                    "mode": "embedding",
                    "custom_tokenizer": {
                        "identifier": "intfloat/multilingual-e5-large-instruct",
                        "revision": "main",
                        "auth_token": None,
                    },
                },
            }
        ]
    )

    # Install the router on the proxy_server module only while the request runs;
    # the previous value is restored so the global does not leak across tests.
    previous_router = getattr(litellm.proxy.proxy_server, "llm_router", None)
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
    try:
        # Test with multilingual content (what e5-large-instruct is designed for)
        response = await token_counter(
            request=TokenCountRequest(
                model="my-embedding-model",
                messages=[
                    {
                        "role": "user",
                        "content": "This is a multilingual test. C'est un test multilingue. 这是一个多语言测试。",
                    }
                ],
            )
        )
    finally:
        setattr(litellm.proxy.proxy_server, "llm_router", previous_router)

    print(
        f"Embedding model test - Tokenizer: {response.tokenizer_type}, Tokens: {response.total_tokens}"
    )

    # Must use HuggingFace tokenizer with intfloat/multilingual-e5-large-instruct
    assert response.tokenizer_type == "huggingface_tokenizer", (
        "The intfloat/multilingual-e5-large-instruct tokenizer was not used! "
        f"Got: {response.tokenizer_type}"
    )
    assert response.total_tokens > 0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_model_without_custom_tokenizer_uses_default():
    """
    Test that models without custom_tokenizer still work correctly.

    The deployment has an empty model_info, so token_counter is expected to
    report the OpenAI tokenizer for the "gpt-4" model.
    """

    llm_router = Router(
        model_list=[
            {
                "model_name": "gpt-4",
                "litellm_params": {
                    "model": "gpt-4",
                },
                "model_info": {},  # No custom_tokenizer
            }
        ]
    )

    # Temporarily replace the module-level router and restore the previous one
    # afterwards so this test leaves no global state behind.
    previous_router = getattr(litellm.proxy.proxy_server, "llm_router", None)
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
    try:
        response = await token_counter(
            request=TokenCountRequest(
                model="gpt-4",
                messages=[{"role": "user", "content": "hello"}],
            )
        )
    finally:
        setattr(litellm.proxy.proxy_server, "llm_router", previous_router)

    # Should use OpenAI tokenizer for GPT-4
    assert response.tokenizer_type == "openai_tokenizer"
    assert response.model_used == "gpt-4"
|
||||
Reference in New Issue
Block a user