fix(token-counter): extract model_info from deployment for custom_tokenizer (#15657) (#15680)

2026-07-01 20:44:04 -04:00 · 2025-10-18 04:38:45 +02:00
parent 21c3720732
commit 6842d705d5
2 changed files with 247 additions and 26 deletions
@@ -477,9 +477,9 @@ except ImportError:
 server_root_path = os.getenv("SERVER_ROOT_PATH", "")
 _license_check = LicenseCheck()
 premium_user: bool = _license_check.is_premium()
-premium_user_data: Optional["EnterpriseLicenseData"] = (
-    _license_check.airgapped_license_data
-)
+premium_user_data: Optional[
+    "EnterpriseLicenseData"
+] = _license_check.airgapped_license_data
 global_max_parallel_request_retries_env: Optional[str] = os.getenv(
    "LITELLM_GLOBAL_MAX_PARALLEL_REQUEST_RETRIES"
 )
@@ -1006,9 +1006,9 @@ worker_config = None
 master_key: Optional[str] = None
 otel_logging = False
 prisma_client: Optional[PrismaClient] = None
-shared_aiohttp_session: Optional["ClientSession"] = (
-    None  # Global shared session for connection reuse
-)
+shared_aiohttp_session: Optional[
+    "ClientSession"
+] = None  # Global shared session for connection reuse
 user_api_key_cache = DualCache(
    default_in_memory_ttl=UserAPIKeyCacheTTLEnum.in_memory_cache_ttl.value
 )
@@ -1016,9 +1016,9 @@ model_max_budget_limiter = _PROXY_VirtualKeyModelMaxBudgetLimiter(
    dual_cache=user_api_key_cache
 )
 litellm.logging_callback_manager.add_litellm_callback(model_max_budget_limiter)
-redis_usage_cache: Optional[RedisCache] = (
-    None  # redis cache used for tracking spend, tpm/rpm limits
-)
+redis_usage_cache: Optional[
+    RedisCache
+] = None  # redis cache used for tracking spend, tpm/rpm limits
 user_custom_auth = None
 user_custom_key_generate = None
 user_custom_sso = None
@@ -1351,9 +1351,9 @@ async def update_cache(  # noqa: PLR0915
        _id = "team_id:{}".format(team_id)
        try:
            # Fetch the existing cost for the given user
-            existing_spend_obj: Optional[LiteLLM_TeamTable] = (
-                await user_api_key_cache.async_get_cache(key=_id)
-            )
+            existing_spend_obj: Optional[
+                LiteLLM_TeamTable
+            ] = await user_api_key_cache.async_get_cache(key=_id)
            if existing_spend_obj is None:
                # do nothing if team not in api key cache
                return
@@ -1518,10 +1518,11 @@ async def _run_background_health_check():

        if shared_health_manager is not None:
            try:
-                healthy_endpoints, unhealthy_endpoints = (
-                    await shared_health_manager.perform_shared_health_check(
-                        model_list=_llm_model_list, details=details_bool
-                    )
+                (
+                    healthy_endpoints,
+                    unhealthy_endpoints,
+                ) = await shared_health_manager.perform_shared_health_check(
+                    model_list=_llm_model_list, details=details_bool
                )
            except Exception as e:
                verbose_proxy_logger.error(
@@ -3370,10 +3371,10 @@ class ProxyConfig:
        )

        try:
-            guardrails_in_db: List[Guardrail] = (
-                await GuardrailRegistry.get_all_guardrails_from_db(
-                    prisma_client=prisma_client
-                )
+            guardrails_in_db: List[
+                Guardrail
+            ] = await GuardrailRegistry.get_all_guardrails_from_db(
+                prisma_client=prisma_client
            )
            verbose_proxy_logger.debug(
                "guardrails from the DB %s", str(guardrails_in_db)
@@ -3603,9 +3604,9 @@ async def initialize(  # noqa: PLR0915
        user_api_base = api_base
        dynamic_config[user_model]["api_base"] = api_base
    if api_version:
-        os.environ["AZURE_API_VERSION"] = (
-            api_version  # set this for azure - litellm can read this from the env
-        )
+        os.environ[
+            "AZURE_API_VERSION"
+        ] = api_version  # set this for azure - litellm can read this from the env
    if max_tokens:  # model-specific param
        dynamic_config[user_model]["max_tokens"] = max_tokens
    if temperature:  # model-specific param
@@ -6241,6 +6242,7 @@ async def token_counter(request: TokenCountRequest, call_endpoint: bool = False)
            pass
    if deployment is not None:
        litellm_model_name = deployment.get("litellm_params", {}).get("model")
+        model_info = deployment.get("model_info", {})
        load_credentials_from_list(deployment.get("litellm_params", {}))
        # remove the custom_llm_provider_prefix in the litellm_model_name
        if "/" in litellm_model_name:
@@ -8968,9 +8970,9 @@ async def get_config_list(
                            hasattr(sub_field_info, "description")
                            and sub_field_info.description is not None
                        ):
-                            nested_fields[idx].field_description = (
-                                sub_field_info.description
-                            )
+                            nested_fields[
+                                idx
+                            ].field_description = sub_field_info.description
                        idx += 1

                    _stored_in_db = None
@@ -0,0 +1,219 @@
+"""
+Test for custom_tokenizer bug fix.
+Issue: custom_tokenizer from model_info was not being extracted from deployment,
+causing token_counter to always use OpenAI tokenizer instead of the configured custom tokenizer.
+"""
+
+import pytest
+import litellm
+import litellm.proxy.proxy_server
+from litellm.proxy.proxy_server import token_counter
+from litellm.proxy._types import TokenCountRequest
+from litellm import Router
+
+
+@pytest.mark.asyncio
+async def test_custom_tokenizer_from_model_info():
+    """
+    Test that custom_tokenizer from model_info is correctly used for token counting.
+
+    Real-world scenario: Using intfloat/multilingual-e5-large-instruct tokenizer
+    for a custom embedding model (like Groq-hosted llama model used for embeddings).
+
+    This test reproduces the bug where:
+    - model_info was declared but never populated from deployment
+    - custom_tokenizer was therefore never extracted
+    - token_counter always fell back to OpenAI tokenizer
+
+    Expected behavior:
+    - When a model has custom_tokenizer in model_info
+    - The token_counter should use that custom tokenizer (intfloat/multilingual-e5-large-instruct)
+    - tokenizer_type should reflect "huggingface_tokenizer" not "openai_tokenizer"
+    """
+
+    # Create a router with a model that has custom_tokenizer for multilingual embeddings
+    # This matches the user's real config with intfloat/multilingual-e5-large-instruct
+    llm_router = Router(
+        model_list=[
+            {
+                "model_name": "nikro-llama",
+                "litellm_params": {
+                    "model": "openai/llama-3.1-8b-instant",
+                    "api_base": "https://api.groq.com/openai/v1",
+                },
+                "model_info": {
+                    "mode": "embedding",
+                    "custom_tokenizer": {
+                        "identifier": "intfloat/multilingual-e5-large-instruct",
+                        "revision": "main",
+                        "auth_token": None,
+                    },
+                },
+            }
+        ]
+    )
+    # NOTE(review): overwrites the module-level llm_router; it is not restored within this test.
+    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
+
+    # Make a token counting request with a multilingual text sample (realistic for the
+    # multilingual-e5 model). NOTE(review): presumably fetches the tokenizer from the HF Hub - confirm.
+    response = await token_counter(
+        request=TokenCountRequest(
+            model="nikro-llama",
+            messages=[
+                {"role": "user", "content": "Hello world! Bonjour le monde! 你好世界!"}
+            ],
+        )
+    )
+
+    print("Response:", response)
+    print("Tokenizer type:", response.tokenizer_type)
+    print("Model used:", response.model_used)
+    print("Total tokens:", response.total_tokens)
+
+    # Verify that custom tokenizer (intfloat/multilingual-e5-large-instruct) was used
+    assert response.tokenizer_type == "huggingface_tokenizer", (
+        f"Expected 'huggingface_tokenizer' (intfloat/multilingual-e5-large-instruct) "
+        f"but got '{response.tokenizer_type}'. "
+        "This indicates the custom_tokenizer from model_info was not used."
+    )
+    assert response.request_model == "nikro-llama"
+    assert response.model_used == "llama-3.1-8b-instant"
+    assert response.total_tokens > 0
+
+
+@pytest.mark.asyncio
+async def test_custom_tokenizer_with_llamacpp():
+    """
+    Test custom_tokenizer on a locally served model (similar to the user's setup).
+
+    The deployment is registered as an OpenAI-compatible endpoint on localhost
+    (as a llama.cpp server presumably would be) with custom_tokenizer set in
+    model_info. The reported symptom was environment-dependent:
+    - In Docker, it was using OpenAI tokenizer (bug)
+    - Locally, it was using HuggingFace tokenizer (correct)
+    """
+
+    llm_router = Router(
+        model_list=[
+            {
+                "model_name": "my-local-model",
+                "litellm_params": {
+                    "model": "openai/my-local-llama",
+                    "api_base": "http://localhost:8080/v1",
+                },
+                "model_info": {
+                    "custom_tokenizer": {
+                        "identifier": "intfloat/multilingual-e5-large-instruct",
+                        "revision": "main",
+                        "auth_token": None,
+                    },
+                },
+            }
+        ]
+    )
+    # NOTE(review): overwrites the module-level llm_router; it is not restored within this test.
+    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
+
+    response = await token_counter(
+        request=TokenCountRequest(
+            model="my-local-model",
+            messages=[{"role": "user", "content": "test message"}],
+        )
+    )
+
+    # Before the fix, model_info was not read from the deployment, so this was "openai_tokenizer"
+    assert (
+        response.tokenizer_type == "huggingface_tokenizer"
+    ), f"Custom tokenizer not used! Got: {response.tokenizer_type}"
+
+
+@pytest.mark.asyncio
+async def test_multilingual_e5_embedding_model():
+    """
+    Test the exact real-world use case: intfloat/multilingual-e5-large-instruct
+    tokenizer with a custom embedding endpoint.
+
+    This is the user's actual production scenario:
+    - Custom embedding model endpoint (could be llama.cpp, vLLM, etc.)
+    - Using intfloat/multilingual-e5-large-instruct for tokenization
+    - Model served via OpenAI-compatible API
+    """
+
+    llm_router = Router(
+        model_list=[
+            {
+                "model_name": "my-embedding-model",
+                "litellm_params": {
+                    "model": "openai/custom-embedding-model",
+                    "api_base": "http://localhost:8080/v1",
+                },
+                "model_info": {
+                    "mode": "embedding",
+                    "custom_tokenizer": {
+                        "identifier": "intfloat/multilingual-e5-large-instruct",
+                        "revision": "main",
+                        "auth_token": None,
+                    },
+                },
+            }
+        ]
+    )
+    # NOTE(review): overwrites the module-level llm_router; it is not restored within this test.
+    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
+
+    # Count tokens for mixed English/French/Chinese content (what e5-large-instruct is designed for)
+    response = await token_counter(
+        request=TokenCountRequest(
+            model="my-embedding-model",
+            messages=[
+                {
+                    "role": "user",
+                    "content": "This is a multilingual test. C'est un test multilingue. 这是一个多语言测试。",
+                }
+            ],
+        )
+    )
+
+    print(
+        f"Embedding model test - Tokenizer: {response.tokenizer_type}, Tokens: {response.total_tokens}"
+    )
+
+    # Must use HuggingFace tokenizer with intfloat/multilingual-e5-large-instruct
+    assert response.tokenizer_type == "huggingface_tokenizer", (
+        f"The intfloat/multilingual-e5-large-instruct tokenizer was not used! "
+        f"Got: {response.tokenizer_type}"
+    )
+    assert response.total_tokens > 0
+
+
+@pytest.mark.asyncio
+async def test_model_without_custom_tokenizer_uses_default():
+    """
+    Test that a model whose model_info has no custom_tokenizer reports "openai_tokenizer".
+    """
+
+    llm_router = Router(
+        model_list=[
+            {
+                "model_name": "gpt-4",
+                "litellm_params": {
+                    "model": "gpt-4",
+                },
+                "model_info": {},  # No custom_tokenizer
+            }
+        ]
+    )
+    # NOTE(review): overwrites the module-level llm_router; it is not restored within this test.
+    setattr(litellm.proxy.proxy_server, "llm_router", llm_router)
+
+    response = await token_counter(
+        request=TokenCountRequest(
+            model="gpt-4",
+            messages=[{"role": "user", "content": "hello"}],
+        )
+    )
+
+    # Should use OpenAI tokenizer for GPT-4
+    assert response.tokenizer_type == "openai_tokenizer"
+    assert response.model_used == "gpt-4"