fix(token-counter): extract model_info from deployment for custom_tokenizer (#15657) (#15680)

This commit is contained in:
Nagailic Sergiu (Nikro)
2025-10-18 04:38:45 +02:00
committed by GitHub
parent 21c3720732
commit 6842d705d5
2 changed files with 247 additions and 26 deletions
+28 -26
View File
@@ -477,9 +477,9 @@ except ImportError:
server_root_path = os.getenv("SERVER_ROOT_PATH", "")
_license_check = LicenseCheck()
premium_user: bool = _license_check.is_premium()
premium_user_data: Optional["EnterpriseLicenseData"] = (
_license_check.airgapped_license_data
)
premium_user_data: Optional[
"EnterpriseLicenseData"
] = _license_check.airgapped_license_data
global_max_parallel_request_retries_env: Optional[str] = os.getenv(
"LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES"
)
@@ -1006,9 +1006,9 @@ worker_config = None
master_key: Optional[str] = None
otel_logging = False
prisma_client: Optional[PrismaClient] = None
shared_aiohttp_session: Optional["ClientSession"] = (
None # Global shared session for connection reuse
)
shared_aiohttp_session: Optional[
"ClientSession"
] = None # Global shared session for connection reuse
user_api_key_cache = DualCache(
default_in_memory_ttl=UserAPIKeyCacheTTLEnum.in_memory_cache_ttl.value
)
@@ -1016,9 +1016,9 @@ model_max_budget_limiter = _PROXY_VirtualKeyModelMaxBudgetLimiter(
dual_cache=user_api_key_cache
)
litellm.logging_callback_manager.add_litellm_callback(model_max_budget_limiter)
redis_usage_cache: Optional[RedisCache] = (
None # redis cache used for tracking spend, tpm/rpm limits
)
redis_usage_cache: Optional[
RedisCache
] = None # redis cache used for tracking spend, tpm/rpm limits
user_custom_auth = None
user_custom_key_generate = None
user_custom_sso = None
@@ -1351,9 +1351,9 @@ async def update_cache( # noqa: PLR0915
_id = "team_id:{}".format(team_id)
try:
# Fetch the existing cost for the given user
existing_spend_obj: Optional[LiteLLM_TeamTable] = (
await user_api_key_cache.async_get_cache(key=_id)
)
existing_spend_obj: Optional[
LiteLLM_TeamTable
] = await user_api_key_cache.async_get_cache(key=_id)
if existing_spend_obj is None:
# do nothing if team not in api key cache
return
@@ -1518,10 +1518,11 @@ async def _run_background_health_check():
if shared_health_manager is not None:
try:
healthy_endpoints, unhealthy_endpoints = (
await shared_health_manager.perform_shared_health_check(
model_list=_llm_model_list, details=details_bool
)
(
healthy_endpoints,
unhealthy_endpoints,
) = await shared_health_manager.perform_shared_health_check(
model_list=_llm_model_list, details=details_bool
)
except Exception as e:
verbose_proxy_logger.error(
@@ -3370,10 +3371,10 @@ class ProxyConfig:
)
try:
guardrails_in_db: List[Guardrail] = (
await GuardrailRegistry.get_all_guardrails_from_db(
prisma_client=prisma_client
)
guardrails_in_db: List[
Guardrail
] = await GuardrailRegistry.get_all_guardrails_from_db(
prisma_client=prisma_client
)
verbose_proxy_logger.debug(
"guardrails from the DB %s", str(guardrails_in_db)
@@ -3603,9 +3604,9 @@ async def initialize( # noqa: PLR0915
user_api_base = api_base
dynamic_config[user_model]["api_base"] = api_base
if api_version:
os.environ["AZURE_API_VERSION"] = (
api_version # set this for azure - litellm can read this from the env
)
os.environ[
"AZURE_API_VERSION"
] = api_version # set this for azure - litellm can read this from the env
if max_tokens: # model-specific param
dynamic_config[user_model]["max_tokens"] = max_tokens
if temperature: # model-specific param
@@ -6241,6 +6242,7 @@ async def token_counter(request: TokenCountRequest, call_endpoint: bool = False)
pass
if deployment is not None:
litellm_model_name = deployment.get("litellm_params", {}).get("model")
model_info = deployment.get("model_info", {})
load_credentials_from_list(deployment.get("litellm_params", {}))
# remove the custom_llm_provider_prefix in the litellm_model_name
if "/" in litellm_model_name:
@@ -8968,9 +8970,9 @@ async def get_config_list(
hasattr(sub_field_info, "description")
and sub_field_info.description is not None
):
nested_fields[idx].field_description = (
sub_field_info.description
)
nested_fields[
idx
].field_description = sub_field_info.description
idx += 1
_stored_in_db = None
@@ -0,0 +1,219 @@
"""
Test for custom_tokenizer bug fix.
Issue: custom_tokenizer from model_info was not being extracted from deployment,
causing token_counter to always use OpenAI tokenizer instead of the configured custom tokenizer.
"""
import pytest
import litellm
import litellm.proxy.proxy_server
from litellm.proxy.proxy_server import token_counter
from litellm.proxy._types import TokenCountRequest
from litellm import Router
@pytest.mark.asyncio
async def test_custom_tokenizer_from_model_info():
    """
    Verify that a ``custom_tokenizer`` declared in ``model_info`` drives token counting.

    Regression being covered:
    - ``model_info`` was declared in ``token_counter`` but never populated
      from the matched deployment
    - the ``custom_tokenizer`` entry was therefore never read
    - token counting always fell back to the OpenAI tokenizer

    Expected behavior:
    - the deployment's ``custom_tokenizer``
      (intfloat/multilingual-e5-large-instruct) is used
    - ``tokenizer_type`` is "huggingface_tokenizer", not "openai_tokenizer"
    """
    # Router with a single deployment whose model_info carries a custom tokenizer.
    llm_router = Router(
        model_list=[
            {
                "model_name": "nikro-llama",
                "litellm_params": {
                    "model": "openai/llama-3.1-8b-instant",
                    "api_base": "https://api.groq.com/openai/v1",
                },
                "model_info": {
                    "mode": "embedding",
                    "custom_tokenizer": {
                        "identifier": "intfloat/multilingual-e5-large-instruct",
                        "revision": "main",
                        "auth_token": None,
                    },
                },
            }
        ]
    )

    # token_counter reads the proxy's module-level router. Install ours only for
    # the duration of the call and put the previous value back afterwards so this
    # test does not leak global state into whatever runs next.
    previous_router = getattr(litellm.proxy.proxy_server, "llm_router", None)
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
    try:
        # Multilingual sample text for the token counting request.
        response = await token_counter(
            request=TokenCountRequest(
                model="nikro-llama",
                messages=[
                    {"role": "user", "content": "Hello world! Bonjour le monde! 你好世界!"}
                ],
            )
        )
    finally:
        setattr(litellm.proxy.proxy_server, "llm_router", previous_router)

    # Diagnostic output; pytest shows it only when the test fails (or with -s).
    print("Response:", response)
    print("Tokenizer type:", response.tokenizer_type)
    print("Model used:", response.model_used)
    print("Total tokens:", response.total_tokens)

    # The custom tokenizer must have been selected.
    assert response.tokenizer_type == "huggingface_tokenizer", (
        "Expected 'huggingface_tokenizer' (intfloat/multilingual-e5-large-instruct) "
        f"but got '{response.tokenizer_type}'. "
        "This indicates the custom_tokenizer from model_info was not used."
    )
    assert response.request_model == "nikro-llama"
    # The "openai/" provider prefix is expected to be stripped from the model name.
    assert response.model_used == "llama-3.1-8b-instant"
    assert response.total_tokens > 0
@pytest.mark.asyncio
async def test_custom_tokenizer_with_llamacpp():
    """
    Verify ``custom_tokenizer`` is honored for a locally served, OpenAI-compatible model.

    Mirrors the reported setup: a llamacpp-style model with ``custom_tokenizer``
    configured in ``model_info`` (and no ``mode`` key). With the bug present the
    response reported "openai_tokenizer" instead of "huggingface_tokenizer".
    """
    llm_router = Router(
        model_list=[
            {
                "model_name": "my-local-model",
                "litellm_params": {
                    "model": "openai/my-local-llama",
                    "api_base": "http://localhost:8080/v1",
                },
                "model_info": {
                    "custom_tokenizer": {
                        "identifier": "intfloat/multilingual-e5-large-instruct",
                        "revision": "main",
                        "auth_token": None,
                    },
                },
            }
        ]
    )

    # Install the router on the proxy module only while the call runs, then
    # restore the previous value so other tests see untouched global state.
    previous_router = getattr(litellm.proxy.proxy_server, "llm_router", None)
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
    try:
        response = await token_counter(
            request=TokenCountRequest(
                model="my-local-model",
                messages=[{"role": "user", "content": "test message"}],
            )
        )
    finally:
        setattr(litellm.proxy.proxy_server, "llm_router", previous_router)

    # The bug would cause this to be "openai_tokenizer".
    assert (
        response.tokenizer_type == "huggingface_tokenizer"
    ), f"Custom tokenizer not used! Got: {response.tokenizer_type}"
@pytest.mark.asyncio
async def test_multilingual_e5_embedding_model():
    """
    Verify the embedding-model scenario: an OpenAI-compatible embedding endpoint
    tokenized with intfloat/multilingual-e5-large-instruct.

    The deployment sets ``mode: embedding`` and a ``custom_tokenizer`` in
    ``model_info``; the token counter must report "huggingface_tokenizer" and a
    positive token count for multilingual input.
    """
    llm_router = Router(
        model_list=[
            {
                "model_name": "my-embedding-model",
                "litellm_params": {
                    "model": "openai/custom-embedding-model",
                    "api_base": "http://localhost:8080/v1",
                },
                "model_info": {
                    "mode": "embedding",
                    "custom_tokenizer": {
                        "identifier": "intfloat/multilingual-e5-large-instruct",
                        "revision": "main",
                        "auth_token": None,
                    },
                },
            }
        ]
    )

    # Install the router on the proxy module only while the call runs, then
    # restore the previous value so other tests see untouched global state.
    previous_router = getattr(litellm.proxy.proxy_server, "llm_router", None)
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
    try:
        # Multilingual content (English, French, Chinese).
        response = await token_counter(
            request=TokenCountRequest(
                model="my-embedding-model",
                messages=[
                    {
                        "role": "user",
                        "content": "This is a multilingual test. C'est un test multilingue. 这是一个多语言测试。",
                    }
                ],
            )
        )
    finally:
        setattr(litellm.proxy.proxy_server, "llm_router", previous_router)

    # Diagnostic output; pytest shows it only when the test fails (or with -s).
    print(
        f"Embedding model test - Tokenizer: {response.tokenizer_type}, Tokens: {response.total_tokens}"
    )

    # Must use the HuggingFace tokenizer configured via custom_tokenizer.
    assert response.tokenizer_type == "huggingface_tokenizer", (
        "The intfloat/multilingual-e5-large-instruct tokenizer was not used! "
        f"Got: {response.tokenizer_type}"
    )
    assert response.total_tokens > 0
@pytest.mark.asyncio
async def test_model_without_custom_tokenizer_uses_default():
    """
    Verify that a deployment with an empty ``model_info`` still counts tokens
    with the default (OpenAI) tokenizer.

    Guards against the ``model_info`` extraction changing behavior for
    deployments that do not configure ``custom_tokenizer``.
    """
    llm_router = Router(
        model_list=[
            {
                "model_name": "gpt-4",
                "litellm_params": {
                    "model": "gpt-4",
                },
                "model_info": {},  # No custom_tokenizer
            }
        ]
    )

    # Install the router on the proxy module only while the call runs, then
    # restore the previous value so other tests see untouched global state.
    previous_router = getattr(litellm.proxy.proxy_server, "llm_router", None)
    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
    try:
        response = await token_counter(
            request=TokenCountRequest(
                model="gpt-4",
                messages=[{"role": "user", "content": "hello"}],
            )
        )
    finally:
        setattr(litellm.proxy.proxy_server, "llm_router", previous_router)

    # Should use OpenAI tokenizer for GPT-4
    assert response.tokenizer_type == "openai_tokenizer"
    assert response.model_used == "gpt-4"