mirror of
https://github.com/onyx-dot-app/litellm.git
synced 2026-07-01 20:44:04 -04:00
[Feat] Add Azure AVA TTS integration (#15749)
* add AzureBaseIssueTokenHandler * add BaseTextToSpeechConfig * async_text_to_speech_handler * add AzureAVATextToSpeechConfig * add get_provider_text_to_speech_config * add AzureAVATextToSpeechConfig * fixes for base_llm_http_handler * fix transform_text_to_speech_request * test_azure_ava_tts_async * test_azure_ava_tts_async * fix TextToSpeechRequestData * fix transform_text_to_speech_request * add text_to_speech_handler in LLMHttpHandler * remove old file * fix transform_text_to_speech_request * fix dispatch_text_to_speech * fix azure TTS * fix AVA TTS * fix transform * fix linting * ci/cd - use one job for audio testing * fix tests * fix llm http handler debugging * unit tests azure tts * docs Azure speech * docs fix * docs azure AVA * docs azure AVA * fix handlers * test_async_realtime_uses_max_size_parameter
This commit is contained in:
+52
-1
@@ -1419,6 +1419,49 @@ jobs:
|
||||
paths:
|
||||
- logging_coverage.xml
|
||||
- logging_coverage
|
||||
audio_testing:
|
||||
docker:
|
||||
- image: cimg/python:3.11
|
||||
auth:
|
||||
username: ${DOCKERHUB_USERNAME}
|
||||
password: ${DOCKERHUB_PASSWORD}
|
||||
working_directory: ~/project
|
||||
|
||||
steps:
|
||||
- checkout
|
||||
- setup_google_dns
|
||||
- run:
|
||||
name: Install Dependencies
|
||||
command: |
|
||||
python -m pip install --upgrade pip
|
||||
python -m pip install -r requirements.txt
|
||||
pip install "pytest==7.3.1"
|
||||
pip install "pytest-retry==1.6.3"
|
||||
pip install "pytest-cov==5.0.0"
|
||||
pip install "pytest-asyncio==0.21.1"
|
||||
pip install "respx==0.22.0"
|
||||
# Run pytest and generate JUnit XML report
|
||||
- run:
|
||||
name: Run tests
|
||||
command: |
|
||||
pwd
|
||||
ls
|
||||
python -m pytest -vv tests/audio_tests --cov=litellm --cov-report=xml -x -s -v --junitxml=test-results/junit.xml --durations=5
|
||||
no_output_timeout: 120m
|
||||
- run:
|
||||
name: Rename the coverage files
|
||||
command: |
|
||||
mv coverage.xml audio_coverage.xml
|
||||
mv .coverage audio_coverage
|
||||
|
||||
# Store test results
|
||||
- store_test_results:
|
||||
path: test-results
|
||||
- persist_to_workspace:
|
||||
root: .
|
||||
paths:
|
||||
- audio_coverage.xml
|
||||
- audio_coverage
|
||||
installing_litellm_on_python:
|
||||
docker:
|
||||
- image: circleci/python:3.8
|
||||
@@ -2784,7 +2827,7 @@ jobs:
|
||||
python -m venv venv
|
||||
. venv/bin/activate
|
||||
pip install coverage
|
||||
coverage combine llm_translation_coverage llm_responses_api_coverage ocr_coverage mcp_coverage logging_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_security_tests_coverage guardrails_coverage
|
||||
coverage combine llm_translation_coverage llm_responses_api_coverage ocr_coverage mcp_coverage logging_coverage audio_coverage litellm_router_coverage local_testing_coverage litellm_assistants_api_coverage auth_ui_unit_tests_coverage langfuse_coverage caching_coverage litellm_proxy_unit_tests_coverage image_gen_coverage pass_through_unit_tests_coverage batches_coverage litellm_security_tests_coverage guardrails_coverage
|
||||
coverage xml
|
||||
- codecov/upload:
|
||||
file: ./coverage.xml
|
||||
@@ -3380,6 +3423,12 @@ workflows:
|
||||
only:
|
||||
- main
|
||||
- /litellm_.*/
|
||||
- audio_testing:
|
||||
filters:
|
||||
branches:
|
||||
only:
|
||||
- main
|
||||
- /litellm_.*/
|
||||
- upload-coverage:
|
||||
requires:
|
||||
- llm_translation_testing
|
||||
@@ -3395,6 +3444,7 @@ workflows:
|
||||
- pass_through_unit_testing
|
||||
- image_gen_testing
|
||||
- logging_testing
|
||||
- audio_testing
|
||||
- litellm_router_testing
|
||||
- litellm_router_unit_testing
|
||||
- caching_unit_tests
|
||||
@@ -3458,6 +3508,7 @@ workflows:
|
||||
- pass_through_unit_testing
|
||||
- image_gen_testing
|
||||
- logging_testing
|
||||
- audio_testing
|
||||
- litellm_router_testing
|
||||
- litellm_router_unit_testing
|
||||
- caching_unit_tests
|
||||
|
||||
@@ -11,7 +11,7 @@ import TabItem from '@theme/TabItem';
|
||||
|-------|-------|
|
||||
| Description | Azure OpenAI Service provides REST API access to OpenAI's powerful language models including o1, o1-mini, GPT-5, GPT-4o, GPT-4o mini, GPT-4 Turbo with Vision, GPT-4, GPT-3.5-Turbo, and Embeddings model series |
|
||||
| Provider Route on LiteLLM | `azure/`, [`azure/o_series/`](#o-series-models), [`azure/gpt5_series/`](#gpt-5-models) |
|
||||
| Supported Operations | [`/chat/completions`](#azure-openai-chat-completion-models), [`/responses`](./azure_responses), [`/completions`](#azure-instruct-models), [`/embeddings`](./azure_embedding), [`/audio/speech`](#azure-text-to-speech-tts), [`/audio/transcriptions`](../audio_transcription), `/fine_tuning`, [`/batches`](#azure-batches-api), `/files`, [`/images`](../image_generation#azure-openai-image-generation-models) |
|
||||
| Supported Operations | [`/chat/completions`](#azure-openai-chat-completion-models), [`/responses`](./azure_responses), [`/completions`](#azure-instruct-models), [`/embeddings`](./azure_embedding), [`/audio/speech`](azure_speech), [`/audio/transcriptions`](../audio_transcription), `/fine_tuning`, [`/batches`](#azure-batches-api), `/files`, [`/images`](../image_generation#azure-openai-image-generation-models) |
|
||||
| Link to Provider Doc | [Azure OpenAI ↗](https://learn.microsoft.com/en-us/azure/ai-services/openai/overview) |
|
||||
|
||||
## API Keys, Params
|
||||
@@ -538,39 +538,6 @@ response = litellm.completion(
|
||||
print(response)
|
||||
```
|
||||
|
||||
## Azure Text to Speech (tts)
|
||||
|
||||
**LiteLLM PROXY**
|
||||
|
||||
```yaml
|
||||
- model_name: azure/tts-1
|
||||
litellm_params:
|
||||
model: azure/tts-1
|
||||
api_base: "os.environ/AZURE_API_BASE_TTS"
|
||||
api_key: "os.environ/AZURE_API_KEY_TTS"
|
||||
api_version: "os.environ/AZURE_API_VERSION"
|
||||
```
|
||||
|
||||
**LiteLLM SDK**
|
||||
|
||||
```python
|
||||
from litellm import completion
|
||||
|
||||
## set ENV variables
|
||||
os.environ["AZURE_API_KEY"] = ""
|
||||
os.environ["AZURE_API_BASE"] = ""
|
||||
os.environ["AZURE_API_VERSION"] = ""
|
||||
|
||||
# azure call
|
||||
speech_file_path = Path(__file__).parent / "speech.mp3"
|
||||
response = speech(
|
||||
model="azure/<your-deployment-name",
|
||||
voice="alloy",
|
||||
input="the quick brown fox jumped over the lazy dogs",
|
||||
)
|
||||
response.stream_to_file(speech_file_path)
|
||||
```
|
||||
|
||||
## **Authentication**
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,68 @@
|
||||
# Azure Text to Speech (tts)
|
||||
|
||||
Convert text to natural-sounding speech using Azure OpenAI's Text to Speech models. Supports multiple voices and audio formats.
|
||||
|
||||
## Quick Start
|
||||
|
||||
**LiteLLM SDK**
|
||||
|
||||
```python showLineNumbers title="SDK Usage"
|
||||
from litellm import speech
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
## set ENV variables
|
||||
os.environ["AZURE_API_KEY"] = ""
|
||||
os.environ["AZURE_API_BASE"] = ""
|
||||
os.environ["AZURE_API_VERSION"] = ""
|
||||
|
||||
# azure call
|
||||
speech_file_path = Path(__file__).parent / "speech.mp3"
|
||||
response = speech(
|
||||
model="azure/<your-deployment-name>",
|
||||
voice="alloy",
|
||||
input="the quick brown fox jumped over the lazy dogs",
|
||||
)
|
||||
response.stream_to_file(speech_file_path)
|
||||
```
|
||||
|
||||
**LiteLLM PROXY**
|
||||
|
||||
```yaml showLineNumbers title="proxy_config.yaml"
|
||||
model_list:
|
||||
- model_name: azure/tts-1
|
||||
litellm_params:
|
||||
model: azure/tts-1
|
||||
api_base: "os.environ/AZURE_API_BASE_TTS"
|
||||
api_key: "os.environ/AZURE_API_KEY_TTS"
|
||||
api_version: "os.environ/AZURE_API_VERSION"
|
||||
```
|
||||
|
||||
## Available Voices
|
||||
|
||||
Azure OpenAI supports the following voices:
|
||||
- `alloy` - Neutral and balanced
|
||||
- `echo` - Warm and upbeat
|
||||
- `fable` - Expressive and dramatic
|
||||
- `onyx` - Deep and authoritative
|
||||
- `nova` - Friendly and conversational
|
||||
- `shimmer` - Bright and cheerful
|
||||
|
||||
## Supported Parameters
|
||||
|
||||
```python showLineNumbers title="All Parameters"
|
||||
response = speech(
|
||||
model="azure/<your-deployment-name>",
|
||||
voice="alloy", # Required: Voice selection
|
||||
input="text to convert", # Required: Input text
|
||||
speed=1.0, # Optional: 0.25 to 4.0 (default: 1.0)
|
||||
response_format="mp3" # Optional: mp3, opus, aac, flac, wav, pcm
|
||||
)
|
||||
```
|
||||
|
||||
## Supported Models
|
||||
|
||||
- `tts-1` - Standard quality, optimized for speed
|
||||
- `tts-1-hd` - High definition, optimized for quality
|
||||
|
||||
Use your Azure deployment name: `azure/<your-deployment-name>`
|
||||
@@ -0,0 +1,166 @@
|
||||
# Azure AI Speech (Cognitive Services)
|
||||
|
||||
Azure AI Speech is Azure's Cognitive Services text-to-speech API, separate from Azure OpenAI. It provides high-quality neural voices with broader language support and advanced speech customization.
|
||||
|
||||
**When to use this vs Azure OpenAI TTS:**
|
||||
- **Azure AI Speech** - More languages, neural voices, SSML support, speech customization
|
||||
- **Azure OpenAI TTS** - OpenAI models, integrated with Azure OpenAI services
|
||||
|
||||
|
||||
## Overview
|
||||
|
||||
| Property | Details |
|
||||
|-------|-------|
|
||||
| Description | Azure AI Speech is Azure's Cognitive Services text-to-speech API, separate from Azure OpenAI. It provides high-quality neural voices with broader language support and advanced speech customization. |
|
||||
| Provider Route on LiteLLM | `azure/speech/` |
|
||||
|
||||
## Quick Start
|
||||
|
||||
**LiteLLM SDK**
|
||||
|
||||
```python showLineNumbers title="SDK Usage"
|
||||
from litellm import speech
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
os.environ["AZURE_TTS_API_KEY"] = "your-cognitive-services-key"
|
||||
|
||||
speech_file_path = Path(__file__).parent / "speech.mp3"
|
||||
response = speech(
|
||||
model="azure/speech/azure-tts",
|
||||
voice="alloy",
|
||||
input="Hello, this is Azure AI Speech",
|
||||
api_base="https://eastus.tts.speech.microsoft.com",
|
||||
api_key=os.environ["AZURE_TTS_API_KEY"],
|
||||
)
|
||||
response.stream_to_file(speech_file_path)
|
||||
```
|
||||
|
||||
**LiteLLM Proxy**
|
||||
|
||||
```yaml showLineNumbers title="proxy_config.yaml"
|
||||
model_list:
|
||||
- model_name: azure-speech
|
||||
litellm_params:
|
||||
model: azure/speech/azure-tts
|
||||
api_base: https://eastus.tts.speech.microsoft.com
|
||||
api_key: os.environ/AZURE_TTS_API_KEY
|
||||
```
|
||||
|
||||
## Setup
|
||||
|
||||
1. Create an Azure Cognitive Services resource in the [Azure Portal](https://portal.azure.com)
|
||||
2. Get your API key from the resource
|
||||
3. Note your region (e.g., `eastus`, `westus`, `westeurope`)
|
||||
4. Use the regional endpoint: `https://{region}.tts.speech.microsoft.com`
|
||||
|
||||
## Voice Mapping
|
||||
|
||||
LiteLLM automatically maps OpenAI voice names to Azure Neural voices:
|
||||
|
||||
| OpenAI Voice | Azure Neural Voice | Description |
|
||||
|-------------|-------------------|-------------|
|
||||
| `alloy` | en-US-JennyNeural | Neutral and balanced |
|
||||
| `echo` | en-US-GuyNeural | Warm and upbeat |
|
||||
| `fable` | en-GB-RyanNeural | Expressive and dramatic |
|
||||
| `onyx` | en-US-DavisNeural | Deep and authoritative |
|
||||
| `nova` | en-US-AmberNeural | Friendly and conversational |
|
||||
| `shimmer` | en-US-AriaNeural | Bright and cheerful |
|
||||
|
||||
## Supported Parameters
|
||||
|
||||
```python showLineNumbers title="All Parameters"
|
||||
response = speech(
|
||||
model="azure/speech/azure-tts",
|
||||
voice="alloy", # Required: Voice selection
|
||||
input="text to convert", # Required: Input text
|
||||
speed=1.0, # Optional: 0.25 to 4.0 (default: 1.0)
|
||||
response_format="mp3", # Optional: mp3, opus, wav, pcm
|
||||
api_base="https://eastus.tts.speech.microsoft.com",
|
||||
api_key="your-key",
|
||||
)
|
||||
```
|
||||
|
||||
### Response Formats
|
||||
|
||||
| Format | Azure Output Format | Sample Rate |
|
||||
|--------|-------------------|-------------|
|
||||
| `mp3` | audio-24khz-48kbitrate-mono-mp3 | 24kHz |
|
||||
| `opus` | ogg-48khz-16bit-mono-opus | 48kHz |
|
||||
| `wav` | riff-24khz-16bit-mono-pcm | 24kHz |
|
||||
| `pcm` | raw-24khz-16bit-mono-pcm | 24kHz |
|
||||
|
||||
## Async Support
|
||||
|
||||
```python showLineNumbers title="Async Usage"
|
||||
import asyncio
import os
|
||||
from litellm import aspeech
|
||||
from pathlib import Path
|
||||
|
||||
async def generate_speech():
|
||||
response = await aspeech(
|
||||
model="azure/speech/azure-tts",
|
||||
voice="alloy",
|
||||
input="Hello from async",
|
||||
api_base="https://eastus.tts.speech.microsoft.com",
|
||||
api_key=os.environ["AZURE_TTS_API_KEY"],
|
||||
)
|
||||
|
||||
speech_file_path = Path(__file__).parent / "speech.mp3"
|
||||
response.stream_to_file(speech_file_path)
|
||||
|
||||
asyncio.run(generate_speech())
|
||||
```
|
||||
|
||||
## Regional Endpoints
|
||||
|
||||
Replace `{region}` with your Azure resource region:
|
||||
|
||||
- US East: `https://eastus.tts.speech.microsoft.com`
|
||||
- US West: `https://westus.tts.speech.microsoft.com`
|
||||
- Europe West: `https://westeurope.tts.speech.microsoft.com`
|
||||
- Asia Southeast: `https://southeastasia.tts.speech.microsoft.com`
|
||||
|
||||
[Full list of regions](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/regions)
|
||||
|
||||
## Advanced Features
|
||||
|
||||
### Custom Neural Voices
|
||||
|
||||
You can use any Azure Neural voice by passing the full voice name:
|
||||
|
||||
```python showLineNumbers title="Custom Voice"
|
||||
response = speech(
|
||||
model="azure/speech/azure-tts",
|
||||
voice="en-US-AriaNeural", # Direct Azure voice name
|
||||
input="Using a specific neural voice",
|
||||
api_base="https://eastus.tts.speech.microsoft.com",
|
||||
api_key=os.environ["AZURE_TTS_API_KEY"],
|
||||
)
|
||||
```
|
||||
|
||||
Browse available voices in the [Azure Speech Gallery](https://speech.microsoft.com/portal/voicegallery).
|
||||
|
||||
## Error Handling
|
||||
|
||||
```python showLineNumbers title="Error Handling"
|
||||
import os
from litellm import speech
|
||||
from litellm.exceptions import APIError
|
||||
|
||||
try:
|
||||
response = speech(
|
||||
model="azure/speech/azure-tts",
|
||||
voice="alloy",
|
||||
input="Test message",
|
||||
api_base="https://eastus.tts.speech.microsoft.com",
|
||||
api_key=os.environ["AZURE_TTS_API_KEY"],
|
||||
)
|
||||
except APIError as e:
|
||||
print(f"Azure Speech error: {e}")
|
||||
```
|
||||
|
||||
## Reference
|
||||
|
||||
- [Azure Speech Service Documentation](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/)
|
||||
- [Text-to-Speech REST API](https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech)
|
||||
|
||||
@@ -418,6 +418,7 @@ const sidebars = {
|
||||
"providers/azure/azure",
|
||||
"providers/azure/azure_responses",
|
||||
"providers/azure/azure_embedding",
|
||||
"providers/azure/azure_speech",
|
||||
]
|
||||
},
|
||||
{
|
||||
@@ -425,6 +426,7 @@ const sidebars = {
|
||||
label: "Azure AI",
|
||||
items: [
|
||||
"providers/azure_ai",
|
||||
"providers/azure_ai_speech",
|
||||
"providers/azure_ai_img",
|
||||
]
|
||||
},
|
||||
|
||||
@@ -94,9 +94,12 @@ AIOHTTP_KEEPALIVE_TIMEOUT = int(os.getenv("AIOHTTP_KEEPALIVE_TIMEOUT", 120))
|
||||
AIOHTTP_TTL_DNS_CACHE = int(os.getenv("AIOHTTP_TTL_DNS_CACHE", 300))
|
||||
|
||||
# WebSocket constants
|
||||
REALTIME_WEBSOCKET_MAX_MESSAGE_SIZE_BYTES = int(
|
||||
os.getenv("REALTIME_WEBSOCKET_MAX_MESSAGE_SIZE_BYTES", 10 * 1024 * 1024)
|
||||
) # 10MB default to handle large base64 audio payloads from realtime APIs
|
||||
# Default to None (unlimited) to match OpenAI's official agents SDK behavior
|
||||
# https://github.com/openai/openai-agents-python/blob/cf1b933660e44fd37b4350c41febab8221801409/src/agents/realtime/openai_realtime.py#L235
|
||||
_max_size_env = os.getenv("REALTIME_WEBSOCKET_MAX_MESSAGE_SIZE_BYTES")
|
||||
REALTIME_WEBSOCKET_MAX_MESSAGE_SIZE_BYTES = (
|
||||
int(_max_size_env) if _max_size_env is not None else None
|
||||
)
|
||||
|
||||
# SSL/TLS cipher configuration for faster handshakes
|
||||
# Strategy: Strongly prefer fast modern ciphers, but allow fallback to commonly supported ones
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
"""Azure Text-to-Speech module"""
|
||||
|
||||
from .transformation import AzureAVATextToSpeechConfig
|
||||
|
||||
__all__ = [
|
||||
"AzureAVATextToSpeechConfig",
|
||||
]
|
||||
|
||||
@@ -0,0 +1,373 @@
|
||||
"""
|
||||
Azure AVA (Cognitive Services) Text-to-Speech transformation
|
||||
|
||||
Maps OpenAI TTS spec to Azure Cognitive Services TTS API
|
||||
"""
|
||||
|
||||
from typing import TYPE_CHECKING, Any, Coroutine, Dict, Optional, Union
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
import litellm
|
||||
from litellm.llms.base_llm.text_to_speech.transformation import (
|
||||
BaseTextToSpeechConfig,
|
||||
TextToSpeechRequestData,
|
||||
)
|
||||
from litellm.secret_managers.main import get_secret_str
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as LiteLLMLoggingObj
|
||||
from litellm.types.llms.openai import HttpxBinaryResponseContent
|
||||
else:
|
||||
LiteLLMLoggingObj = Any
|
||||
HttpxBinaryResponseContent = Any
|
||||
|
||||
|
||||
class AzureAVATextToSpeechConfig(BaseTextToSpeechConfig):
|
||||
"""
|
||||
Configuration for Azure AVA (Cognitive Services) Text-to-Speech
|
||||
|
||||
Reference: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/rest-text-to-speech
|
||||
"""
|
||||
|
||||
# Azure endpoint domains
|
||||
COGNITIVE_SERVICES_DOMAIN = "api.cognitive.microsoft.com"
|
||||
TTS_SPEECH_DOMAIN = "tts.speech.microsoft.com"
|
||||
TTS_ENDPOINT_PATH = "/cognitiveservices/v1"
|
||||
|
||||
# Voice name mappings from OpenAI voices to Azure voices
|
||||
VOICE_MAPPINGS = {
|
||||
"alloy": "en-US-JennyNeural",
|
||||
"echo": "en-US-GuyNeural",
|
||||
"fable": "en-GB-RyanNeural",
|
||||
"onyx": "en-US-DavisNeural",
|
||||
"nova": "en-US-AmberNeural",
|
||||
"shimmer": "en-US-AriaNeural",
|
||||
}
|
||||
|
||||
# Response format mappings from OpenAI to Azure
|
||||
FORMAT_MAPPINGS = {
|
||||
"mp3": "audio-24khz-48kbitrate-mono-mp3",
|
||||
"opus": "ogg-48khz-16bit-mono-opus",
|
||||
"aac": "audio-24khz-48kbitrate-mono-mp3", # Azure doesn't have AAC, use MP3
|
||||
"flac": "audio-24khz-48kbitrate-mono-mp3", # Azure doesn't have FLAC, use MP3
|
||||
"wav": "riff-24khz-16bit-mono-pcm",
|
||||
"pcm": "raw-24khz-16bit-mono-pcm",
|
||||
}
|
||||
|
||||
def dispatch_text_to_speech(
    self,
    model: str,
    input: str,
    voice: Optional[Union[str, Dict]],
    optional_params: Dict,
    litellm_params_dict: Dict,
    logging_obj: "LiteLLMLoggingObj",
    timeout: Union[float, httpx.Timeout],
    extra_headers: Optional[Dict[str, Any]],
    base_llm_http_handler: Any,
    aspeech: bool,
    api_base: Optional[str],
    api_key: Optional[str],
    **kwargs: Any,
) -> Union[
    "HttpxBinaryResponseContent",
    Coroutine[Any, Any, "HttpxBinaryResponseContent"],
]:
    """
    Resolve Azure credentials and forward a TTS request to the shared HTTP handler.

    The endpoint and the key are each taken from the first truthy source, in
    this order: the explicit argument, ``litellm_params_dict``, the ``litellm``
    module globals, then ``get_secret_str`` lookups (``AZURE_API_BASE`` for the
    endpoint; ``AZURE_OPENAI_API_KEY`` then ``AZURE_API_KEY`` for the key).

    Args:
        voice: A voice name, or a dict whose ``"name"`` entry holds it. Any
            other value is forwarded to the handler as ``None``.
        litellm_params_dict: Updated in place with the resolved ``api_key``
            and ``api_base`` before being passed on.
        base_llm_http_handler: Object exposing ``text_to_speech_handler``
            (the BaseLLMHTTPHandler instance from main.py).
        aspeech: Forwarded to the handler as ``_is_async``.

    Returns:
        Whatever ``text_to_speech_handler`` returns for the request.
    """
    # Endpoint: explicit value, per-call params, module global, then secret.
    resolved_api_base = api_base or litellm_params_dict.get("api_base")
    if not resolved_api_base:
        resolved_api_base = litellm.api_base or get_secret_str("AZURE_API_BASE")

    # Key: same idea, with the Azure-specific fallbacks last.
    resolved_api_key = api_key or litellm_params_dict.get("api_key")
    if not resolved_api_key:
        resolved_api_key = litellm.api_key or litellm.azure_key
    if not resolved_api_key:
        resolved_api_key = get_secret_str("AZURE_OPENAI_API_KEY") or get_secret_str(
            "AZURE_API_KEY"
        )

    # The handler is given the voice as a plain string (or None).
    voice_name: Optional[str]
    if isinstance(voice, dict):
        voice_name = voice.get("name") if voice else None
    else:
        voice_name = voice if isinstance(voice, str) else None

    litellm_params_dict.update(
        {
            "api_key": resolved_api_key,
            "api_base": resolved_api_base,
        }
    )

    return base_llm_http_handler.text_to_speech_handler(
        model=model,
        input=input,
        voice=voice_name,
        text_to_speech_provider_config=self,
        text_to_speech_optional_params=optional_params,
        custom_llm_provider="azure",
        litellm_params=litellm_params_dict,
        logging_obj=logging_obj,
        timeout=timeout,
        extra_headers=extra_headers,
        client=None,
        _is_async=aspeech,
    )
|
||||
|
||||
def get_supported_openai_params(self, model: str) -> list:
    """
    List the OpenAI TTS parameters this provider accepts.

    The result is the same for every ``model``.
    """
    supported_params = ["voice", "response_format", "speed"]
    return supported_params
|
||||
|
||||
def _convert_speed_to_azure_rate(self, speed: float) -> str:
    """
    Convert an OpenAI speed multiplier to an Azure SSML prosody rate string.

    Args:
        speed: OpenAI speed value (0.25-4.0, default 1.0)

    Returns:
        Signed percentage relative to normal speed (e.g., "+50%", "-50%", "+0%")

    Examples:
        speed=1.0 -> "+0%" (default)
        speed=2.0 -> "+100%"
        speed=0.5 -> "-50%"
        speed=1.2 -> "+20%"
    """
    # round() rather than int(): in binary floating point (1.2 - 1.0) * 100
    # evaluates to 19.999999999999996, which int() would truncate to 19.
    rate_percentage = round((speed - 1.0) * 100)
    return f"{rate_percentage:+d}%"
|
||||
|
||||
def map_openai_params(
    self,
    model: str,
    optional_params: Dict,
    drop_params: bool,
) -> Dict:
    """
    Map OpenAI TTS parameters to Azure AVA TTS parameters.

    Args:
        model: Not used by the mapping.
        optional_params: OpenAI-style parameters; ``voice``,
            ``response_format`` and ``speed`` are read.
        drop_params: Not used by the mapping.

    Returns:
        A new dict with:
        - ``voice``: only when a string voice was given. OpenAI voice names
          are translated through ``VOICE_MAPPINGS``; any other string is
          passed through unchanged as an Azure voice name.
        - ``output_format``: always present. Known OpenAI formats are
          translated through ``FORMAT_MAPPINGS``, unknown values are passed
          through as-is, and a missing or ``None`` format falls back to MP3.
        - ``rate``: only when ``speed`` is given and not ``None``.
    """
    default_output_format = "audio-24khz-48kbitrate-mono-mp3"
    mapped_params: Dict = {}

    voice = optional_params.get("voice")
    if isinstance(voice, str):
        mapped_params["voice"] = self.VOICE_MAPPINGS.get(voice, voice)

    format_name = optional_params.get("response_format")
    if format_name is None:
        # An explicit None must not be forwarded: it would end up as the
        # value of the output-format header.
        mapped_params["output_format"] = default_output_format
    else:
        mapped_params["output_format"] = self.FORMAT_MAPPINGS.get(
            format_name, format_name
        )

    # OpenAI speed (0.25-4.0) becomes an SSML prosody rate.
    speed = optional_params.get("speed")
    if speed is not None:
        mapped_params["rate"] = self._convert_speed_to_azure_rate(speed=speed)

    return mapped_params
|
||||
|
||||
def validate_environment(
    self,
    headers: dict,
    model: str,
    api_key: Optional[str] = None,
    api_base: Optional[str] = None,
) -> dict:
    """
    Build the request headers for an Azure AVA TTS call.

    Works on a copy of ``headers``; the caller's dict is left untouched.
    ``model`` and ``api_base`` are not consulted.

    Returns:
        The copied headers with:
        - ``Ocp-Apim-Subscription-Key`` set to ``api_key`` when one is given
        - ``Content-Type`` forced to ``application/ssml+xml``
        - ``User-Agent`` forced to ``litellm``
    """
    request_headers = headers.copy()

    # Azure AVA TTS accepts either an Ocp-Apim-Subscription-Key header or an
    # Authorization: Bearer <token> header. Only the subscription-key form is
    # produced here.
    # NOTE(review): the original comments say a bearer token may be added
    # later by the handler - confirm against the handler code.
    if api_key:
        request_headers.update({"Ocp-Apim-Subscription-Key": api_key})

    request_headers.update(
        {
            "Content-Type": "application/ssml+xml",  # the body is SSML
            "User-Agent": "litellm",
        }
    )
    return request_headers
|
||||
|
||||
def get_complete_url(
    self,
    model: str,
    api_base: Optional[str],
    litellm_params: dict,
) -> str:
    """
    Work out the full request URL for an Azure AVA TTS call.

    Azure TTS endpoint format:
        https://{region}.tts.speech.microsoft.com/cognitiveservices/v1

    Resolution rules, applied after stripping trailing slashes:
    - Cognitive Services host: the region is taken from the hostname and a
      fresh TTS URL is built from it (anything else in ``api_base`` is
      discarded).
    - TTS host that already ends with the endpoint path: returned as-is.
    - Anything else (TTS host without the path, or a custom endpoint): the
      endpoint path is appended.

    Raises:
        ValueError: if ``api_base`` is None.
    """
    if api_base is None:
        raise ValueError(
            "api_base is required for Azure AVA TTS. "
            f"Format: https://{{region}}.{self.COGNITIVE_SERVICES_DOMAIN} or "
            f"https://{{region}}.{self.TTS_SPEECH_DOMAIN}"
        )

    base_url = api_base.rstrip("/")
    host = urlparse(base_url).hostname or ""

    # A Cognitive Services host is rewritten to the regional TTS host.
    if self._is_cognitive_services_endpoint(hostname=host):
        return self._build_tts_url(
            region=self._extract_region_from_hostname(
                hostname=host,
                domain=self.COGNITIVE_SERVICES_DOMAIN,
            )
        )

    # Only a TTS host that already carries the path is left untouched.
    already_complete = self._is_tts_endpoint(hostname=host) and base_url.endswith(
        self.TTS_ENDPOINT_PATH
    )
    if already_complete:
        return base_url
    return f"{base_url}{self.TTS_ENDPOINT_PATH}"
|
||||
|
||||
def _is_cognitive_services_endpoint(self, hostname: str) -> bool:
    """Tell whether ``hostname`` is the Cognitive Services domain or a subdomain of it."""
    domain = self.COGNITIVE_SERVICES_DOMAIN
    if hostname == domain:
        return True
    # The leading dot ensures only true subdomains match, not lookalike hosts.
    return hostname.endswith("." + domain)
|
||||
|
||||
def _is_tts_endpoint(self, hostname: str) -> bool:
    """Tell whether ``hostname`` is the TTS speech domain or a subdomain of it."""
    domain = self.TTS_SPEECH_DOMAIN
    if hostname == domain:
        return True
    # The leading dot ensures only true subdomains match, not lookalike hosts.
    return hostname.endswith("." + domain)
|
||||
|
||||
def _extract_region_from_hostname(self, hostname: str, domain: str) -> str:
    """
    Return the part of ``hostname`` that precedes ``.{domain}``.

    Examples:
        eastus.api.cognitive.microsoft.com -> eastus
        api.cognitive.microsoft.com -> ""

    Returns an empty string when ``hostname`` is not a subdomain of ``domain``.
    """
    suffix = "." + domain
    if not hostname.endswith(suffix):
        return ""
    return hostname[: len(hostname) - len(suffix)]
|
||||
|
||||
def _build_tts_url(self, region: str) -> str:
    """
    Build the complete TTS URL, prefixing the TTS domain with ``region``.

    An empty ``region`` yields a URL on the bare TTS domain.
    """
    host = self.TTS_SPEECH_DOMAIN
    if region:
        host = f"{region}.{host}"
    return f"https://{host}{self.TTS_ENDPOINT_PATH}"
|
||||
|
||||
def transform_text_to_speech_request(
    self,
    model: str,
    input: str,
    voice: Optional[str],
    optional_params: Dict,
    litellm_params: Dict,
    headers: dict,
) -> "TextToSpeechRequestData":
    """
    Transform an OpenAI-style TTS request into an Azure AVA TTS SSML request.

    Note: optional_params is expected to be already mapped via
    map_openai_params (keys ``voice``, ``output_format``, ``rate``).

    Args:
        model: Not used when building the request.
        input: Text to synthesize; XML-escaped before being embedded.
        voice: Used only when ``optional_params`` carries no voice. OpenAI
            voice names are translated through ``VOICE_MAPPINGS``; other
            strings are used as Azure voice names.
        optional_params: Mapped parameters. Missing or empty values fall back
            to voice ``en-US-AriaNeural``, output format
            ``audio-24khz-48kbitrate-mono-mp3`` and rate ``0%``.
        litellm_params: Not used when building the request.
        headers: Updated in place with ``X-Microsoft-OutputFormat`` and
            returned in the result.

    Returns:
        TextToSpeechRequestData: ``ssml_body`` holds the SSML document and
        ``headers`` the (mutated) headers dict.
    """
    # Function-scope import keeps the block self-contained.
    from xml.sax.saxutils import escape

    # escape() handles &, < and >. Both quote characters are added so the
    # same call is safe for text nodes and for single-quoted attribute values.
    quote_entities = {'"': "&quot;", "'": "&apos;"}

    azure_voice = optional_params.get("voice")
    if not azure_voice and isinstance(voice, str) and voice:
        # Honor the explicit argument instead of silently using the default.
        azure_voice = self.VOICE_MAPPINGS.get(voice, voice)
    if not azure_voice:
        azure_voice = "en-US-AriaNeural"

    output_format = (
        optional_params.get("output_format") or "audio-24khz-48kbitrate-mono-mp3"
    )
    headers["X-Microsoft-OutputFormat"] = output_format

    rate = optional_params.get("rate") or "0%"

    # Voice and rate can be caller-supplied, so they are escaped as well;
    # otherwise a quote in either value would break out of its attribute.
    safe_voice = escape(str(azure_voice), quote_entities)
    safe_rate = escape(str(rate), quote_entities)
    safe_input = escape(input, quote_entities)

    ssml_body = (
        "<speak version='1.0' xml:lang='en-US'>"
        f"<voice name='{safe_voice}'>"
        f"<prosody rate='{safe_rate}'>"
        f"{safe_input}"
        "</prosody>"
        "</voice>"
        "</speak>"
    )

    return {
        "ssml_body": ssml_body,
        "headers": headers,
    }
|
||||
|
||||
def transform_text_to_speech_response(
    self,
    model: str,
    raw_response: httpx.Response,
    logging_obj: "LiteLLMLoggingObj",
) -> "HttpxBinaryResponseContent":
    """
    Wrap the raw Azure AVA TTS response in the standard binary response type.

    The response is not parsed or inspected here; ``model`` and
    ``logging_obj`` are not used.
    """
    from litellm.types.llms.openai import HttpxBinaryResponseContent

    # Per the provider notes, the audio bytes are the response body itself,
    # so the response object is handed over unchanged.
    wrapped_audio = HttpxBinaryResponseContent(raw_response)
    return wrapped_audio
|
||||
|
||||
@@ -0,0 +1,147 @@
|
||||
import types
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import TYPE_CHECKING, Any, Dict, Optional, TypedDict
|
||||
|
||||
import httpx
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from litellm.litellm_core_utils.litellm_logging import Logging as _LiteLLMLoggingObj
|
||||
from litellm.types.llms.openai import (
|
||||
HttpxBinaryResponseContent as _HttpxBinaryResponseContent,
|
||||
)
|
||||
|
||||
from ..chat.transformation import BaseLLMException as _BaseLLMException
|
||||
|
||||
LiteLLMLoggingObj = _LiteLLMLoggingObj
|
||||
BaseLLMException = _BaseLLMException
|
||||
HttpxBinaryResponseContent = _HttpxBinaryResponseContent
|
||||
else:
|
||||
LiteLLMLoggingObj = Any
|
||||
BaseLLMException = Any
|
||||
HttpxBinaryResponseContent = Any
|
||||
|
||||
|
||||
class TextToSpeechRequestData(TypedDict, total=False):
    """
    Structured return type for text-to-speech transformations.

    This ensures a consistent interface across all TTS providers.
    Every key is optional (``total=False``). Providers should set ONE body
    key - ``dict_body`` or ``ssml_body`` - plus any ``headers`` they need.
    """

    dict_body: Dict[str, Any]  # JSON request body (e.g., OpenAI TTS)
    ssml_body: str  # SSML/XML string body (e.g., Azure AVA TTS)
    headers: Dict[str, str]  # Provider-specific headers to merge with base headers
|
||||
|
||||
|
||||
class BaseTextToSpeechConfig(ABC):
    """
    Abstract interface every text-to-speech provider config implements.

    Subclasses supply parameter mapping, header/URL construction and the
    request/response transformations.
    """

    def __init__(self):
        pass

    @classmethod
    def get_config(cls):
        """
        Collect the class-level configuration values declared on ``cls``.

        Skips dunder names, ``_abc``-prefixed names, functions, builtin
        functions, classmethods, staticmethods and ``None`` values. Only
        attributes defined directly on ``cls`` are considered.
        """
        callable_kinds = (
            types.FunctionType,
            types.BuiltinFunctionType,
            classmethod,
            staticmethod,
        )
        config = {}
        for attr_name, attr_value in cls.__dict__.items():
            if attr_name.startswith("__") or attr_name.startswith("_abc"):
                continue
            if isinstance(attr_value, callable_kinds):
                continue
            if attr_value is None:
                continue
            config[attr_name] = attr_value
        return config

    @abstractmethod
    def get_supported_openai_params(self, model: str) -> list:
        """
        Return the OpenAI TTS parameters this provider supports.
        """

    @abstractmethod
    def map_openai_params(
        self,
        model: str,
        optional_params: Dict,
        drop_params: bool,
    ) -> Dict:
        """
        Translate OpenAI TTS parameters into provider-specific parameters.
        """

    @abstractmethod
    def validate_environment(
        self,
        headers: dict,
        model: str,
        api_key: Optional[str] = None,
        api_base: Optional[str] = None,
    ) -> dict:
        """
        Validate the environment and return the request headers.

        The base implementation returns an empty dict.
        """
        return {}

    @abstractmethod
    def get_complete_url(
        self,
        model: str,
        api_base: Optional[str],
        litellm_params: dict,
    ) -> str:
        """
        Return the complete URL for the request.

        The base implementation returns ``api_base`` unchanged.

        Raises:
            ValueError: if ``api_base`` is None.
        """
        if api_base is not None:
            return api_base
        raise ValueError("api_base is required")

    @abstractmethod
    def transform_text_to_speech_request(
        self,
        model: str,
        input: str,
        voice: Optional[str],
        optional_params: Dict,
        litellm_params: Dict,
        headers: dict,
    ) -> TextToSpeechRequestData:
        """
        Transform the request into the provider-specific format.

        Returns:
            TextToSpeechRequestData: A structured dict containing:
                - dict_body or ssml_body: The request body (JSON dict or
                  SSML/XML string)
                - headers: Provider-specific headers to merge with base headers
        """

    @abstractmethod
    def transform_text_to_speech_response(
        self,
        model: str,
        raw_response: httpx.Response,
        logging_obj: LiteLLMLoggingObj,
    ) -> "HttpxBinaryResponseContent":
        """
        Transform the provider response into the standard binary response type.
        """

    def get_error_class(
        self, error_message: str, status_code: int, headers: Dict
    ) -> BaseLLMException:
        """
        Raise a ``BaseLLMException`` built from the given error details.

        NOTE(review): the signature is annotated as returning the exception,
        but this implementation raises it - confirm what callers expect
        before changing either side.
        """
        from ..chat.transformation import BaseLLMException

        error = BaseLLMException(
            status_code=status_code,
            message=error_message,
            headers=headers,
        )
        raise error
|
||||
|
||||
@@ -44,6 +44,9 @@ from litellm.llms.base_llm.ocr.transformation import BaseOCRConfig, OCRResponse
|
||||
from litellm.llms.base_llm.realtime.transformation import BaseRealtimeConfig
|
||||
from litellm.llms.base_llm.rerank.transformation import BaseRerankConfig
|
||||
from litellm.llms.base_llm.responses.transformation import BaseResponsesAPIConfig
|
||||
from litellm.llms.base_llm.text_to_speech.transformation import (
|
||||
BaseTextToSpeechConfig,
|
||||
)
|
||||
from litellm.llms.base_llm.vector_store.transformation import BaseVectorStoreConfig
|
||||
from litellm.llms.custom_httpx.http_handler import (
|
||||
AsyncHTTPHandler,
|
||||
@@ -63,6 +66,7 @@ from litellm.types.llms.anthropic_messages.anthropic_response import (
|
||||
from litellm.types.llms.openai import (
|
||||
CreateBatchRequest,
|
||||
CreateFileRequest,
|
||||
HttpxBinaryResponseContent,
|
||||
OpenAIFileObject,
|
||||
ResponseInputParam,
|
||||
ResponsesAPIResponse,
|
||||
@@ -3281,6 +3285,7 @@ class BaseLLMHTTPHandler:
|
||||
BaseAnthropicMessagesConfig,
|
||||
BaseBatchesConfig,
|
||||
BaseOCRConfig,
|
||||
BaseTextToSpeechConfig,
|
||||
"BasePassthroughConfig",
|
||||
],
|
||||
):
|
||||
@@ -4343,3 +4348,222 @@ class BaseLLMHTTPHandler:
|
||||
raw_response=response,
|
||||
logging_obj=logging_obj,
|
||||
)
|
||||
|
||||
#####################################################################
|
||||
################ TEXT TO SPEECH HANDLER ###########################
|
||||
#####################################################################
|
||||
def text_to_speech_handler(
    self,
    model: str,
    input: str,
    voice: Optional[str],
    text_to_speech_provider_config: BaseTextToSpeechConfig,
    text_to_speech_optional_params: Dict,
    custom_llm_provider: str,
    litellm_params: Dict,
    logging_obj: LiteLLMLoggingObj,
    timeout: Union[float, httpx.Timeout],
    extra_headers: Optional[Dict[str, Any]] = None,
    client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
    _is_async: bool = False,
) -> Union[
    "HttpxBinaryResponseContent",
    Coroutine[Any, Any, "HttpxBinaryResponseContent"],
]:
    """
    Handles text-to-speech requests.

    When _is_async=True, returns a coroutine (from
    ``async_text_to_speech_handler``) instead of making the call directly.

    Headers are layered in this order, later entries winning on conflict:
    ``validate_environment`` output, then ``extra_headers``, then the
    ``headers`` entry of the transformed request data.

    The request body is sent as JSON when the provider returns ``dict_body``
    and as raw data when it returns ``ssml_body``. Any exception raised while
    sending (including a missing body) is routed through ``_handle_error``.
    """
    if _is_async:
        return self.async_text_to_speech_handler(
            model=model,
            input=input,
            voice=voice,
            text_to_speech_provider_config=text_to_speech_provider_config,
            text_to_speech_optional_params=text_to_speech_optional_params,
            custom_llm_provider=custom_llm_provider,
            litellm_params=litellm_params,
            logging_obj=logging_obj,
            extra_headers=extra_headers,
            timeout=timeout,
            # A sync client cannot be reused on the async path.
            client=client if isinstance(client, AsyncHTTPHandler) else None,
        )

    if client is None or not isinstance(client, HTTPHandler):
        sync_httpx_client = _get_httpx_client(
            params={"ssl_verify": litellm_params.get("ssl_verify", None)}
        )
    else:
        sync_httpx_client = client

    headers = text_to_speech_provider_config.validate_environment(
        api_key=litellm_params.get("api_key"),
        headers=extra_headers or {},
        model=model,
        api_base=litellm_params.get("api_base"),
    )

    if extra_headers:
        headers.update(extra_headers)

    api_base = text_to_speech_provider_config.get_complete_url(
        model=model,
        api_base=litellm_params.get("api_base"),
        litellm_params=litellm_params,
    )

    request_data = text_to_speech_provider_config.transform_text_to_speech_request(
        model=model,
        input=input,
        voice=voice,
        optional_params=text_to_speech_optional_params,
        litellm_params=litellm_params,
        headers=headers,
    )

    # Merge provider-specific headers
    if "headers" in request_data:
        headers.update(request_data["headers"])

    ## LOGGING
    logging_obj.pre_call(
        input=input,
        api_key="",
        additional_args={
            "complete_input_dict": request_data,
            "api_base": api_base,
            "headers": headers,
        },
    )

    try:
        # Determine request body type and send appropriately
        if "dict_body" in request_data:
            response = sync_httpx_client.post(
                url=api_base,
                headers=headers,
                json=request_data["dict_body"],
                timeout=timeout,
            )
        elif "ssml_body" in request_data:
            response = sync_httpx_client.post(
                url=api_base,
                headers=headers,
                data=request_data["ssml_body"],
                timeout=timeout,
            )
        else:
            # Only the body keys this handler can actually send are listed.
            raise ValueError(
                "No body found in request_data. Must provide one of: dict_body, ssml_body"
            )

    except Exception as e:
        raise self._handle_error(
            e=e,
            provider_config=text_to_speech_provider_config,
        )

    return text_to_speech_provider_config.transform_text_to_speech_response(
        model=model,
        raw_response=response,
        logging_obj=logging_obj,
    )
|
||||
|
||||
async def async_text_to_speech_handler(
    self,
    model: str,
    input: str,
    voice: Optional[str],
    text_to_speech_provider_config: BaseTextToSpeechConfig,
    text_to_speech_optional_params: Dict,
    custom_llm_provider: str,
    litellm_params: Dict,
    logging_obj: LiteLLMLoggingObj,
    timeout: Union[float, httpx.Timeout],
    extra_headers: Optional[Dict[str, Any]] = None,
    client: Optional[Union[HTTPHandler, AsyncHTTPHandler]] = None,
) -> "HttpxBinaryResponseContent":
    """
    Async version of the text-to-speech handler.

    Uses async HTTP client to make requests. A new async client is obtained
    for ``custom_llm_provider`` unless ``client`` is an ``AsyncHTTPHandler``.

    Headers are layered in this order, later entries winning on conflict:
    ``validate_environment`` output, then ``extra_headers``, then the
    ``headers`` entry of the transformed request data.

    The request body is sent as JSON when the provider returns ``dict_body``
    and as raw data when it returns ``ssml_body``. Any exception raised while
    sending (including a missing body) is routed through ``_handle_error``.
    """
    if client is None or not isinstance(client, AsyncHTTPHandler):
        async_httpx_client = get_async_httpx_client(
            llm_provider=litellm.LlmProviders(custom_llm_provider),
            params={"ssl_verify": litellm_params.get("ssl_verify", None)},
        )
    else:
        async_httpx_client = client

    headers = text_to_speech_provider_config.validate_environment(
        api_key=litellm_params.get("api_key"),
        headers=extra_headers or {},
        model=model,
        api_base=litellm_params.get("api_base"),
    )

    if extra_headers:
        headers.update(extra_headers)

    api_base = text_to_speech_provider_config.get_complete_url(
        model=model,
        api_base=litellm_params.get("api_base"),
        litellm_params=litellm_params,
    )

    request_data = text_to_speech_provider_config.transform_text_to_speech_request(
        model=model,
        input=input,
        voice=voice,
        optional_params=text_to_speech_optional_params,
        litellm_params=litellm_params,
        headers=headers,
    )

    # Merge provider-specific headers
    if "headers" in request_data:
        headers.update(request_data["headers"])

    ## LOGGING
    logging_obj.pre_call(
        input=input,
        api_key="",
        additional_args={
            "complete_input_dict": request_data,
            "api_base": api_base,
            "headers": headers,
        },
    )

    try:
        # Determine request body type and send appropriately
        if "dict_body" in request_data:
            response = await async_httpx_client.post(
                url=api_base,
                headers=headers,
                json=request_data["dict_body"],
                timeout=timeout,
            )
        elif "ssml_body" in request_data:
            response = await async_httpx_client.post(
                url=api_base,
                headers=headers,
                data=request_data["ssml_body"],
                timeout=timeout,
            )
        else:
            # Only the body keys this handler can actually send are listed.
            raise ValueError(
                "No body found in request_data. Must provide one of: dict_body, ssml_body"
            )

    except Exception as e:
        raise self._handle_error(
            e=e,
            provider_config=text_to_speech_provider_config,
        )

    return text_to_speech_provider_config.transform_text_to_speech_response(
        model=model,
        raw_response=response,
        logging_obj=logging_obj,
    )
|
||||
|
||||
+89
-40
@@ -5690,12 +5690,28 @@ def speech( # noqa: PLR0915
|
||||
optional_params["speed"] = speed # type: ignore
|
||||
if instructions is not None:
|
||||
optional_params["instructions"] = instructions
|
||||
|
||||
if timeout is None:
|
||||
timeout = litellm.request_timeout
|
||||
|
||||
if max_retries is None:
|
||||
max_retries = litellm.num_retries or openai.DEFAULT_MAX_RETRIES
|
||||
litellm_params_dict = get_litellm_params(**kwargs)
|
||||
|
||||
# Get provider-specific text-to-speech config and map parameters
|
||||
text_to_speech_provider_config = ProviderConfigManager.get_provider_text_to_speech_config(
|
||||
model=model,
|
||||
provider=litellm.LlmProviders(custom_llm_provider),
|
||||
)
|
||||
|
||||
# Map OpenAI params to provider-specific params if config exists
|
||||
if text_to_speech_provider_config is not None:
|
||||
optional_params = text_to_speech_provider_config.map_openai_params(
|
||||
model=model,
|
||||
optional_params=optional_params,
|
||||
drop_params=False,
|
||||
)
|
||||
|
||||
logging_obj: Logging = cast(Logging, kwargs.get("litellm_logging_obj"))
|
||||
logging_obj.update_environment_variables(
|
||||
model=model,
|
||||
@@ -5769,52 +5785,85 @@ def speech( # noqa: PLR0915
|
||||
aspeech=aspeech,
|
||||
)
|
||||
elif custom_llm_provider == "azure":
|
||||
# azure configs
|
||||
if voice is None or not (isinstance(voice, str)):
|
||||
raise litellm.BadRequestError(
|
||||
message="'voice' is required to be passed as a string for Azure TTS",
|
||||
model=model,
|
||||
llm_provider=custom_llm_provider,
|
||||
# Check if this is Azure Speech Service (Cognitive Services TTS)
|
||||
if model.startswith("speech/"):
|
||||
from litellm.llms.azure.text_to_speech.transformation import (
|
||||
AzureAVATextToSpeechConfig,
|
||||
)
|
||||
api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
|
||||
|
||||
api_version = api_version or litellm.api_version or get_secret("AZURE_API_VERSION") # type: ignore
|
||||
# Azure AVA (Cognitive Services) Text-to-Speech
|
||||
if text_to_speech_provider_config is None:
|
||||
raise litellm.BadRequestError(
|
||||
message="Azure Speech Service configuration not found",
|
||||
model=model,
|
||||
llm_provider=custom_llm_provider,
|
||||
)
|
||||
|
||||
api_key = (
|
||||
api_key
|
||||
or litellm.api_key
|
||||
or litellm.azure_key
|
||||
or get_secret("AZURE_OPENAI_API_KEY")
|
||||
or get_secret("AZURE_API_KEY")
|
||||
) # type: ignore
|
||||
# Cast to specific Azure config type to access dispatch method
|
||||
azure_config = cast(AzureAVATextToSpeechConfig, text_to_speech_provider_config)
|
||||
|
||||
response = azure_config.dispatch_text_to_speech( # type: ignore
|
||||
model=model,
|
||||
input=input,
|
||||
voice=voice,
|
||||
optional_params=optional_params,
|
||||
litellm_params_dict=litellm_params_dict,
|
||||
logging_obj=logging_obj,
|
||||
timeout=timeout,
|
||||
extra_headers=extra_headers,
|
||||
base_llm_http_handler=base_llm_http_handler,
|
||||
aspeech=aspeech or False,
|
||||
api_base=api_base,
|
||||
api_key=api_key,
|
||||
**kwargs,
|
||||
)
|
||||
else:
|
||||
# Azure OpenAI TTS
|
||||
if voice is None or not (isinstance(voice, str)):
|
||||
raise litellm.BadRequestError(
|
||||
message="'voice' is required to be passed as a string for Azure TTS",
|
||||
model=model,
|
||||
llm_provider=custom_llm_provider,
|
||||
)
|
||||
api_base = api_base or litellm.api_base or get_secret("AZURE_API_BASE") # type: ignore
|
||||
|
||||
azure_ad_token: Optional[str] = optional_params.get("extra_body", {}).pop( # type: ignore
|
||||
"azure_ad_token", None
|
||||
) or get_secret(
|
||||
"AZURE_AD_TOKEN"
|
||||
)
|
||||
azure_ad_token_provider = kwargs.get("azure_ad_token_provider", None)
|
||||
api_version = api_version or litellm.api_version or get_secret("AZURE_API_VERSION") # type: ignore
|
||||
|
||||
if extra_headers:
|
||||
optional_params["extra_headers"] = extra_headers
|
||||
api_key = (
|
||||
api_key
|
||||
or litellm.api_key
|
||||
or litellm.azure_key
|
||||
or get_secret("AZURE_OPENAI_API_KEY")
|
||||
or get_secret("AZURE_API_KEY")
|
||||
) # type: ignore
|
||||
|
||||
response = azure_chat_completions.audio_speech(
|
||||
model=model,
|
||||
input=input,
|
||||
voice=voice,
|
||||
optional_params=optional_params,
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
api_version=api_version,
|
||||
azure_ad_token=azure_ad_token,
|
||||
azure_ad_token_provider=azure_ad_token_provider,
|
||||
organization=organization,
|
||||
max_retries=max_retries,
|
||||
timeout=timeout,
|
||||
client=client, # pass AsyncOpenAI, OpenAI client
|
||||
aspeech=aspeech,
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
azure_ad_token: Optional[str] = optional_params.get("extra_body", {}).pop( # type: ignore
|
||||
"azure_ad_token", None
|
||||
) or get_secret(
|
||||
"AZURE_AD_TOKEN"
|
||||
)
|
||||
azure_ad_token_provider = kwargs.get("azure_ad_token_provider", None)
|
||||
|
||||
if extra_headers:
|
||||
optional_params["extra_headers"] = extra_headers
|
||||
|
||||
response = azure_chat_completions.audio_speech(
|
||||
model=model,
|
||||
input=input,
|
||||
voice=voice,
|
||||
optional_params=optional_params,
|
||||
api_key=api_key,
|
||||
api_base=api_base,
|
||||
api_version=api_version,
|
||||
azure_ad_token=azure_ad_token,
|
||||
azure_ad_token_provider=azure_ad_token_provider,
|
||||
organization=organization,
|
||||
max_retries=max_retries,
|
||||
timeout=timeout,
|
||||
client=client, # pass AsyncOpenAI, OpenAI client
|
||||
aspeech=aspeech,
|
||||
litellm_params=litellm_params_dict,
|
||||
)
|
||||
elif custom_llm_provider == "vertex_ai" or custom_llm_provider == "vertex_ai_beta":
|
||||
generic_optional_params = GenericLiteLLMParams(**kwargs)
|
||||
|
||||
|
||||
@@ -17,8 +17,8 @@ from litellm.proxy.auth.user_api_key_auth import user_api_key_auth
|
||||
from litellm.proxy.common_request_processing import ProxyBaseLLMRequestProcessing
|
||||
from litellm.proxy.common_utils.http_parsing_utils import _read_request_body
|
||||
from litellm.proxy.common_utils.openai_endpoint_utils import (
|
||||
get_custom_llm_provider_from_request_query,
|
||||
get_custom_llm_provider_from_request_headers,
|
||||
get_custom_llm_provider_from_request_query,
|
||||
)
|
||||
from litellm.proxy.openai_files_endpoints.common_utils import (
|
||||
_is_base64_encoded_unified_file_id,
|
||||
@@ -372,11 +372,11 @@ async def list_batches(
|
||||
```
|
||||
"""
|
||||
from litellm.proxy.proxy_server import (
|
||||
general_settings,
|
||||
llm_router,
|
||||
proxy_config,
|
||||
proxy_logging_obj,
|
||||
version,
|
||||
general_settings,
|
||||
proxy_config,
|
||||
)
|
||||
|
||||
verbose_proxy_logger.debug("GET /v1/batches after={} limit={}".format(after, limit))
|
||||
@@ -429,9 +429,9 @@ async def list_batches(
|
||||
|
||||
## POST CALL HOOKS ###
|
||||
_response = await proxy_logging_obj.post_call_success_hook(
|
||||
data=data, user_api_key_dict=user_api_key_dict, response=response
|
||||
data=data, user_api_key_dict=user_api_key_dict, response=response # type: ignore
|
||||
)
|
||||
if _response is not None and type(response) == type(_response):
|
||||
if _response is not None and type(response) is type(_response):
|
||||
response = _response
|
||||
|
||||
### RESPONSE HEADERS ###
|
||||
|
||||
+26
-1
@@ -144,6 +144,9 @@ from litellm.llms.base_llm.google_genai.transformation import (
|
||||
BaseGoogleGenAIGenerateContentConfig,
|
||||
)
|
||||
from litellm.llms.base_llm.ocr.transformation import BaseOCRConfig
|
||||
from litellm.llms.base_llm.text_to_speech.transformation import (
|
||||
BaseTextToSpeechConfig,
|
||||
)
|
||||
from litellm.llms.bedrock.common_utils import BedrockModelInfo
|
||||
from litellm.llms.custom_httpx.http_handler import AsyncHTTPHandler, HTTPHandler
|
||||
from litellm.llms.mistral.ocr.transformation import MistralOCRConfig
|
||||
@@ -7218,7 +7221,9 @@ class ProviderConfigManager:
|
||||
elif litellm.LlmProviders.COMETAPI == provider:
|
||||
return litellm.CometAPIEmbeddingConfig()
|
||||
elif litellm.LlmProviders.SAGEMAKER == provider:
|
||||
from litellm.llms.sagemaker.embedding.transformation import SagemakerEmbeddingConfig
|
||||
from litellm.llms.sagemaker.embedding.transformation import (
|
||||
SagemakerEmbeddingConfig,
|
||||
)
|
||||
return SagemakerEmbeddingConfig.get_model_config(model)
|
||||
return None
|
||||
|
||||
@@ -7614,6 +7619,26 @@ class ProviderConfigManager:
|
||||
return None
|
||||
return config_class()
|
||||
|
||||
@staticmethod
def get_provider_text_to_speech_config(
    model: str,
    provider: LlmProviders,
) -> Optional["BaseTextToSpeechConfig"]:
    """
    Get text-to-speech configuration for a given provider.

    Args:
        model: Model name. Not consulted by the current lookup.
        provider: Provider to resolve a TTS config for.

    Returns:
        An ``AzureAVATextToSpeechConfig`` instance when ``provider`` is
        Azure, otherwise ``None``.
    """
    if litellm.LlmProviders.AZURE == provider:
        # Imported only when the Azure branch is taken, as in the original.
        from litellm.llms.azure.text_to_speech.transformation import (
            AzureAVATextToSpeechConfig,
        )

        return AzureAVATextToSpeechConfig()
    return None
|
||||
|
||||
@staticmethod
|
||||
def get_provider_google_genai_generate_content_config(
|
||||
model: str,
|
||||
|
||||
Binary file not shown.
@@ -325,3 +325,53 @@ def test_audio_speech_gemini():
|
||||
)
|
||||
|
||||
print(result)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
@pytest.mark.flaky(retries=3, delay=1)
async def test_azure_ava_tts_async():
    """
    Test Azure AVA (Cognitive Services) Text-to-Speech with real API request.

    Reads the key from the AZURE_TTS_API_KEY environment variable and writes
    the returned audio to ``azure_speech.mp3`` next to this test file.
    """
    litellm._turn_on_debug()
    api_key = os.getenv("AZURE_TTS_API_KEY")
    api_base = "https://eastus.tts.speech.microsoft.com"

    speech_file_path = Path(__file__).parent / "azure_speech.mp3"

    # No blanket try/except here: letting exceptions and assertion failures
    # propagate keeps the full traceback and pytest's assertion introspection.
    response = await litellm.aspeech(
        model="azure/speech/azure-tts",
        voice="alloy",
        input="Hello, this is a test of Azure text to speech",
        api_base=api_base,
        api_key=api_key,
        response_format="mp3",
        speed=1.0,
    )

    # Assert the response is HttpxBinaryResponseContent
    from litellm.types.llms.openai import HttpxBinaryResponseContent

    assert isinstance(response, HttpxBinaryResponseContent)

    # Get the binary content
    binary_content = response.content
    assert len(binary_content) > 0

    # MP3 files start with these magic bytes
    # ID3 tag or MPEG sync word
    assert binary_content.startswith((b"ID3", b"\xff\xfb", b"\xff\xf3"))

    # Write to file
    response.stream_to_file(speech_file_path)

    # Verify file was created and has content
    assert speech_file_path.exists()
    assert speech_file_path.stat().st_size > 0

    print(f"Azure TTS audio saved to: {speech_file_path}")
|
||||
Binary file not shown.
@@ -0,0 +1,69 @@
|
||||
import os
|
||||
import sys
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(
|
||||
0, os.path.abspath("../../../../..")
|
||||
) # Adds the parent directory to the system path
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_async_realtime_uses_max_size_parameter():
    """
    Check the ``max_size`` keyword Azure's async_realtime passes to
    ``websockets.connect`` (relevant for large base64 audio payloads).

    The test expects the keyword to be present and ``None``.

    This verifies the fix for: https://github.com/BerriAI/litellm/issues/15747
    """
    from litellm.constants import REALTIME_WEBSOCKET_MAX_MESSAGE_SIZE_BYTES
    from litellm.llms.azure.realtime.handler import AzureOpenAIRealtime

    class DummyAsyncContextManager:
        """Minimal async context manager that yields a preset value."""

        def __init__(self, value):
            self.value = value

        async def __aenter__(self):
            return self.value

        async def __aexit__(self, exc_type, exc, tb):
            return None

    realtime_handler = AzureOpenAIRealtime()
    call_kwargs = {
        "model": "gpt-4o-realtime-preview",
        "websocket": AsyncMock(),
        "logging_obj": MagicMock(),
        "api_base": "https://my-endpoint.openai.azure.com",
        "api_key": "test-key",
        "api_version": "2024-10-01-preview",
    }
    backend_ws = AsyncMock()

    connect_patch = patch(
        "websockets.connect",
        return_value=DummyAsyncContextManager(backend_ws),
    )
    streaming_patch = patch("litellm.llms.azure.realtime.handler.RealTimeStreaming")

    with connect_patch as mock_ws_connect:
        with streaming_patch as mock_realtime_streaming:
            streaming_instance = MagicMock()
            mock_realtime_streaming.return_value = streaming_instance
            streaming_instance.bidirectional_forward = AsyncMock()

            await realtime_handler.async_realtime(**call_kwargs)

            # websockets.connect must have been called exactly once
            mock_ws_connect.assert_called_once()
            connect_kwargs = mock_ws_connect.call_args[1]

            # max_size must be passed explicitly; the expected default is None
            # (unlimited), matching OpenAI's official agents SDK:
            # https://github.com/openai/openai-agents-python/blob/cf1b933660e44fd37b4350c41febab8221801409/src/agents/realtime/openai_realtime.py#L235
            assert "max_size" in connect_kwargs
            assert connect_kwargs["max_size"] is None

            mock_realtime_streaming.assert_called_once()
            streaming_instance.bidirectional_forward.assert_awaited_once()
|
||||
|
||||
@@ -0,0 +1,284 @@
|
||||
from unittest.mock import Mock
|
||||
|
||||
import httpx
|
||||
import pytest
|
||||
|
||||
from litellm.llms.azure.text_to_speech.transformation import AzureAVATextToSpeechConfig
|
||||
|
||||
|
||||
@pytest.fixture
def azure_tts_config() -> AzureAVATextToSpeechConfig:
    """
    Provide an AzureAVATextToSpeechConfig instance to the tests in this module.
    """
    config = AzureAVATextToSpeechConfig()
    return config
|
||||
|
||||
|
||||
# Tests for map_openai_params
|
||||
def test_map_openai_params_voice_mapping(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    An OpenAI voice name ("alloy") is translated to its Azure AVA voice.
    """
    result = azure_tts_config.map_openai_params(
        model="azure-tts",
        optional_params={"voice": "alloy"},
        drop_params=False,
    )
    azure_voice = result["voice"]

    assert azure_voice == "en-US-JennyNeural"
|
||||
|
||||
|
||||
def test_map_openai_params_custom_azure_voice(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    A voice that is already an Azure voice name is passed through unchanged.
    """
    result = azure_tts_config.map_openai_params(
        model="azure-tts",
        optional_params={"voice": "en-GB-RyanNeural"},
        drop_params=False,
    )
    azure_voice = result["voice"]

    assert azure_voice == "en-GB-RyanNeural"
|
||||
|
||||
|
||||
def test_map_openai_params_response_format(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    The OpenAI ``response_format`` "mp3" maps to the Azure output format string.
    """
    result = azure_tts_config.map_openai_params(
        model="azure-tts",
        optional_params={"response_format": "mp3"},
        drop_params=False,
    )
    output_format = result["output_format"]

    assert output_format == "audio-24khz-48kbitrate-mono-mp3"
|
||||
|
||||
|
||||
def test_map_openai_params_default_format(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    With no parameters supplied, the default Azure output format is used.
    """
    result = azure_tts_config.map_openai_params(
        model="azure-tts",
        optional_params={},
        drop_params=False,
    )
    output_format = result["output_format"]

    assert output_format == "audio-24khz-48kbitrate-mono-mp3"
|
||||
|
||||
|
||||
def test_map_openai_params_speed(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    An OpenAI ``speed`` above 1.0 becomes a positive Azure rate percentage.
    """
    result = azure_tts_config.map_openai_params(
        model="azure-tts",
        optional_params={"speed": 1.5},
        drop_params=False,
    )
    rate = result["rate"]

    # 1.5x speed is expected as +50%
    assert rate == "+50%"
|
||||
|
||||
|
||||
def test_map_openai_params_slow_speed(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    An OpenAI ``speed`` below 1.0 becomes a negative Azure rate percentage.
    """
    result = azure_tts_config.map_openai_params(
        model="azure-tts",
        optional_params={"speed": 0.5},
        drop_params=False,
    )
    rate = result["rate"]

    # 0.5x speed is expected as -50%
    assert rate == "-50%"
|
||||
|
||||
|
||||
# Tests for get_complete_url
|
||||
def test_get_complete_url_cognitive_services(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    A Cognitive Services endpoint is rewritten to the regional TTS endpoint.
    """
    complete_url = azure_tts_config.get_complete_url(
        model="azure-tts",
        api_base="https://eastus.api.cognitive.microsoft.com",
        litellm_params={},
    )
    expected_url = "https://eastus.tts.speech.microsoft.com/cognitiveservices/v1"

    assert complete_url == expected_url
|
||||
|
||||
|
||||
def test_get_complete_url_tts_endpoint(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    A bare TTS endpoint gets the ``/cognitiveservices/v1`` path appended.
    """
    complete_url = azure_tts_config.get_complete_url(
        model="azure-tts",
        api_base="https://westus.tts.speech.microsoft.com",
        litellm_params={},
    )
    expected_url = "https://westus.tts.speech.microsoft.com/cognitiveservices/v1"

    assert complete_url == expected_url
|
||||
|
||||
|
||||
def test_get_complete_url_tts_endpoint_with_path(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    A TTS endpoint that already carries the path is returned without duplication.
    """
    full_endpoint = "https://westus.tts.speech.microsoft.com/cognitiveservices/v1"

    complete_url = azure_tts_config.get_complete_url(
        model="azure-tts",
        api_base=full_endpoint,
        litellm_params={},
    )

    assert complete_url == "https://westus.tts.speech.microsoft.com/cognitiveservices/v1"
|
||||
|
||||
|
||||
def test_get_complete_url_custom_endpoint(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    A custom domain gets the ``/cognitiveservices/v1`` path appended.
    """
    complete_url = azure_tts_config.get_complete_url(
        model="azure-tts",
        api_base="https://custom.domain.com",
        litellm_params={},
    )
    expected_url = "https://custom.domain.com/cognitiveservices/v1"

    assert complete_url == expected_url
|
||||
|
||||
|
||||
def test_get_complete_url_missing_api_base(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    Omitting ``api_base`` raises ValueError with the expected message.
    """
    call_kwargs = {
        "model": "azure-tts",
        "api_base": None,
        "litellm_params": {},
    }

    with pytest.raises(ValueError, match="api_base is required"):
        azure_tts_config.get_complete_url(**call_kwargs)
|
||||
|
||||
|
||||
# Tests for transform_text_to_speech_request
|
||||
def test_transform_text_to_speech_request_basic(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    A basic request produces an SSML body with the text, the voice and the
    expected SSML elements.
    """
    request_data = azure_tts_config.transform_text_to_speech_request(
        model="azure-tts",
        input="Hello world",
        voice="en-US-AriaNeural",
        optional_params={"voice": "en-US-AriaNeural"},
        litellm_params={},
        headers={},
    )

    assert "ssml_body" in request_data

    expected_fragments = (
        "Hello world",
        "en-US-AriaNeural",
        "<speak",
        "<voice",
        "<prosody",
    )
    for fragment in expected_fragments:
        assert fragment in request_data["ssml_body"]
|
||||
|
||||
|
||||
def test_transform_text_to_speech_request_with_rate(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    A custom ``rate`` in the optional params shows up in the SSML body.
    """
    params = {"voice": "en-US-AriaNeural", "rate": "+50%"}

    request_data = azure_tts_config.transform_text_to_speech_request(
        model="azure-tts",
        input="Test message",
        voice="en-US-AriaNeural",
        optional_params=params,
        litellm_params={},
        headers={},
    )
    ssml = request_data["ssml_body"]

    assert "+50%" in ssml
|
||||
|
||||
|
||||
def test_transform_text_to_speech_request_xml_escaping(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    Test XML special characters are properly escaped

    The input contains ``<``, ``>``, ``&``, ``'`` and ``"``; the SSML body must
    carry their entity-escaped forms, not the raw markup.
    """
    input_text = "Test <tag> & 'quotes' \"double\""

    result = azure_tts_config.transform_text_to_speech_request(
        model="azure-tts",
        input=input_text,
        voice="en-US-AriaNeural",
        optional_params={"voice": "en-US-AriaNeural"},
        litellm_params={},
        headers={},
    )

    ssml = result["ssml_body"]
    assert "&lt;tag&gt;" in ssml
    assert "&amp;" in ssml
    # Any standard XML encoding of the apostrophe is accepted.
    assert any(entity in ssml for entity in ("&apos;", "&#39;", "&#x27;"))
    assert "&quot;" in ssml
    # The raw tag must not survive into the SSML document.
    assert "<tag>" not in ssml
|
||||
|
||||
|
||||
def test_transform_text_to_speech_request_headers(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    The requested output format is returned in the X-Microsoft-OutputFormat header.
    """
    params = {
        "voice": "en-US-AriaNeural",
        "output_format": "audio-16khz-32kbitrate-mono-mp3",
    }

    request_data = azure_tts_config.transform_text_to_speech_request(
        model="azure-tts",
        input="Test",
        voice="en-US-AriaNeural",
        optional_params=params,
        litellm_params={},
        headers={},
    )
    output_format_header = request_data["headers"]["X-Microsoft-OutputFormat"]

    assert output_format_header == "audio-16khz-32kbitrate-mono-mp3"
|
||||
|
||||
|
||||
# Tests for transform_text_to_speech_response
|
||||
def test_transform_text_to_speech_response(azure_tts_config: AzureAVATextToSpeechConfig):
    """
    ``transform_text_to_speech_response`` must wrap the raw HTTP response in
    an ``HttpxBinaryResponseContent`` instance.
    """
    # Stand-in for a successful httpx response that carries audio bytes.
    fake_http_response = Mock(
        spec=httpx.Response,
        content=b"fake_audio_data",
        status_code=200,
        headers={"content-type": "audio/mpeg"},
    )
    logging_stub = Mock()

    transformed = azure_tts_config.transform_text_to_speech_response(
        model="azure-tts",
        raw_response=fake_http_response,
        logging_obj=logging_stub,
    )

    # The returned object should be the binary-content wrapper type.
    from litellm.types.llms.openai import HttpxBinaryResponseContent

    assert isinstance(transformed, HttpxBinaryResponseContent)
|
||||
|
||||
@@ -91,9 +91,10 @@ def test_openai_realtime_handler_model_parameter_inclusion():
|
||||
|
||||
|
||||
import asyncio
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, MagicMock, patch
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_realtime_success():
|
||||
@@ -197,3 +198,63 @@ async def test_async_realtime_url_contains_model():
|
||||
|
||||
mock_realtime_streaming.assert_called_once()
|
||||
mock_streaming_instance.bidirectional_forward.assert_awaited_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_async_realtime_uses_max_size_parameter():
    """
    Check that ``async_realtime`` passes an explicit ``max_size`` keyword to
    ``websockets.connect`` so that large base64 audio payloads can be handled.

    The value asserted here is ``None`` (no limit), the expected default of
    REALTIME_WEBSOCKET_MAX_MESSAGE_SIZE_BYTES.

    This verifies the fix for: https://github.com/BerriAI/litellm/issues/15747
    """
    # The constant is imported but not otherwise referenced below; the import
    # itself fails the test if the constant is ever removed.
    from litellm.constants import REALTIME_WEBSOCKET_MAX_MESSAGE_SIZE_BYTES
    from litellm.llms.openai.realtime.handler import OpenAIRealtime
    from litellm.types.realtime import RealtimeQueryParams

    class _StubConnection:
        """Minimal async context manager that yields a pre-built object."""

        def __init__(self, value):
            self.value = value

        async def __aenter__(self):
            return self.value

        async def __aexit__(self, exc_type, exc, tb):
            return None

    model = "gpt-4o-realtime-preview"
    api_base = "https://api.openai.com/"
    api_key = "test-key"
    query_params: RealtimeQueryParams = {"model": model}

    client_ws = AsyncMock()
    logging_stub = MagicMock()
    backend_ws = AsyncMock()
    handler = OpenAIRealtime()

    with patch("websockets.connect", return_value=_StubConnection(backend_ws)) as connect_spy:
        with patch("litellm.llms.openai.realtime.handler.RealTimeStreaming") as streaming_cls:
            streaming_obj = MagicMock()
            streaming_cls.return_value = streaming_obj
            streaming_obj.bidirectional_forward = AsyncMock()

            await handler.async_realtime(
                model=model,
                websocket=client_ws,
                logging_obj=logging_stub,
                api_base=api_base,
                api_key=api_key,
                query_params=query_params,
            )

            # Exactly one backend connection, opened with a max_size keyword.
            connect_spy.assert_called_once()
            connect_kwargs = connect_spy.call_args[1]

            # None means "unlimited", matching OpenAI's official agents SDK:
            # https://github.com/openai/openai-agents-python/blob/cf1b933660e44fd37b4350c41febab8221801409/src/agents/realtime/openai_realtime.py#L235
            assert "max_size" in connect_kwargs
            assert connect_kwargs["max_size"] is None

            # The streaming bridge must still be built and awaited once.
            streaming_cls.assert_called_once()
            streaming_obj.bidirectional_forward.assert_awaited_once()
|
||||
|
||||
Reference in New Issue
Block a user