| """Speech synthesis service interface. | |
| This module defines the interface for text-to-speech (TTS) services that convert | |
| textual content into audio. The interface supports both batch and streaming | |
| synthesis with multiple voice options and quality settings. | |
| The interface is designed to be: | |
| - Voice-flexible: Supports multiple voices and languages | |
| - Quality-configurable: Allows control over synthesis parameters | |
| - Streaming-capable: Supports real-time audio generation | |
| - Provider-agnostic: Works with any TTS implementation | |
| """ | |
| from abc import ABC, abstractmethod | |
| from typing import Iterator, TYPE_CHECKING | |
| if TYPE_CHECKING: | |
| from ..models.speech_synthesis_request import SpeechSynthesisRequest | |
| from ..models.audio_content import AudioContent | |
| from ..models.audio_chunk import AudioChunk | |


class ISpeechSynthesisService(ABC):
    """Interface for speech synthesis services.

    This interface defines the contract for converting text to speech using
    various TTS models and voices. Implementations should support both batch
    processing and streaming synthesis for different use cases.

    Example:
        ```python
        # Use through dependency injection
        tts_service = container.resolve(ISpeechSynthesisService)

        # Create synthesis request
        request = SpeechSynthesisRequest(
            text_content=text_content,
            voice_settings=voice_settings
        )

        # Batch synthesis
        audio = tts_service.synthesize(request)

        # Or streaming synthesis
        for chunk in tts_service.synthesize_stream(request):
            # Process audio chunk in real-time
            play_audio_chunk(chunk)
        ```
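
    Implementations subclass this interface and provide both methods. A
    minimal skeleton is sketched below; the class name is illustrative and
    is not a provider shipped with this module:

        ```python
        class MySpeechSynthesisService(ISpeechSynthesisService):
            def synthesize(self, request):
                # Produce the complete AudioContent in one call.
                ...

            def synthesize_stream(self, request):
                # Yield AudioChunk objects as audio becomes available.
                ...
        ```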
| """ | |

    @abstractmethod
    def synthesize(self, request: 'SpeechSynthesisRequest') -> 'AudioContent':
        """Synthesize speech from text in batch mode.

        Converts text content to audio using the specified voice settings and
        returns the complete audio content. This method is suitable for
        shorter texts or when the complete audio is needed before playback.

        Implementation considerations:
        - Text preprocessing (SSML support, pronunciation handling)
        - Voice loading and configuration
        - Audio quality optimization
        - Memory management for long texts
        - Error recovery and fallback voices

        Args:
            request: The speech synthesis request containing:
                - text_content: Text to synthesize with language information
                - voice_settings: Voice configuration including voice ID, speed,
                  pitch, volume, and other voice-specific parameters

        Returns:
            AudioContent: The synthesized audio containing:
                - data: Raw audio data in the specified format
                - format: Audio format (WAV, MP3, etc.)
                - sample_rate: Audio sample rate in Hz
                - duration: Audio duration in seconds
                - metadata: Additional synthesis information

        Raises:
            SpeechSynthesisException: If synthesis fails due to:
                - Unsupported voice or language
                - Text processing errors (invalid characters, length limits)
                - Voice model loading failures
                - Insufficient system resources
            ValueError: If request parameters are invalid:
                - Empty text content
                - Unsupported voice settings
                - Invalid audio format specifications

        Example:
            ```python
            # Create text content
            text = TextContent(
                text="Hello, this is a test of speech synthesis.",
                language="en"
            )

            # Configure voice settings
            voice_settings = VoiceSettings(
                voice_id="chatterbox",
                speed=1.0,
                pitch=0.0,
                volume=1.0
            )

            # Create synthesis request
            request = SpeechSynthesisRequest(
                text_content=text,
                voice_settings=voice_settings
            )

            # Synthesize audio
            try:
                audio = service.synthesize(request)

                # Save to file
                with open("output.wav", "wb") as f:
                    f.write(audio.data)

                print(f"Generated {audio.duration:.1f}s of audio")
            except SpeechSynthesisException as e:
                print(f"Synthesis failed: {e}")
            ```
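
        The "error recovery and fallback voices" consideration above can also
        be handled at the call site. A minimal sketch, assuming a second,
        pre-configured `fallback_voice_settings` object exists (the name is
        hypothetical):

            ```python
            # Retry once with a fallback voice if the primary voice fails
            try:
                audio = service.synthesize(request)
            except SpeechSynthesisException:
                audio = service.synthesize(
                    SpeechSynthesisRequest(
                        text_content=text,
                        voice_settings=fallback_voice_settings
                    )
                )
            ```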
| """ | |
| pass | |

    @abstractmethod
    def synthesize_stream(self, request: 'SpeechSynthesisRequest') -> Iterator['AudioChunk']:
        """Synthesize speech from text as a stream of audio chunks.

        Converts text content to audio in streaming mode, yielding audio chunks
        as they become available. This method is suitable for real-time playback,
        long texts, or cases where low latency is required.

        Implementation considerations:
        - Chunk size optimization for smooth playback
        - Buffer management and memory efficiency
        - Error handling without breaking the stream
        - Proper stream termination and cleanup
        - Latency minimization for real-time use cases

        Args:
            request: The speech synthesis request containing text and voice settings.
                Same format as batch synthesis but optimized for streaming.

        Yields:
            AudioChunk: Individual audio chunks containing:
                - data: Raw audio data for this chunk
                - format: Audio format (consistent across chunks)
                - sample_rate: Audio sample rate in Hz
                - chunk_index: Sequential chunk number
                - is_final: Boolean indicating whether this is the last chunk
                - timestamp: Chunk generation timestamp

        Raises:
            SpeechSynthesisException: If synthesis fails during streaming:
                - Voice model errors during processing
                - Network issues (for cloud-based synthesis)
                - Resource exhaustion during long synthesis
            ValueError: If request parameters are invalid for streaming

        Example:
            ```python
            # Create streaming synthesis request
            request = SpeechSynthesisRequest(
                text_content=long_text,
                voice_settings=voice_settings
            )

            # Stream synthesis with real-time playback
            audio_buffer = []
            try:
                for chunk in service.synthesize_stream(request):
                    # Add to playback buffer
                    audio_buffer.append(chunk.data)

                    # Start playback when the buffer is sufficient
                    if len(audio_buffer) >= 3:  # Buffer 3 chunks
                        play_audio_chunk(audio_buffer.pop(0))

                    # Handle the final chunk
                    if chunk.is_final:
                        # Play remaining buffered chunks
                        for remaining in audio_buffer:
                            play_audio_chunk(remaining)
                        break
            except SpeechSynthesisException as e:
                print(f"Streaming synthesis failed: {e}")
            ```
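
        When playback is not needed, the stream can also be collected and
        written out in one piece. A minimal sketch reusing the same request
        (the output filename is arbitrary):

            ```python
            # Concatenate raw chunk data and write it once the stream ends
            audio_bytes = bytearray()
            for chunk in service.synthesize_stream(request):
                audio_bytes.extend(chunk.data)
                if chunk.is_final:
                    break

            with open("streamed_output.wav", "wb") as f:
                f.write(bytes(audio_bytes))
            ```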
| """ | |
| pass |