| """Speech synthesis service interface. | |
| This module defines the interface for text-to-speech (TTS) services that convert | |
| textual content into audio. The interface supports both batch and streaming | |
| synthesis with multiple voice options and quality settings. | |
| The interface is designed to be: | |
| - Voice-flexible: Supports multiple voices and languages | |
| - Quality-configurable: Allows control over synthesis parameters | |
| - Streaming-capable: Supports real-time audio generation | |
| - Provider-agnostic: Works with any TTS implementation | |
| """ | |
| from abc import ABC, abstractmethod | |
| from typing import Iterator, TYPE_CHECKING | |
| if TYPE_CHECKING: | |
| from ..models.speech_synthesis_request import SpeechSynthesisRequest | |
| from ..models.audio_content import AudioContent | |
| from ..models.audio_chunk import AudioChunk | |


class ISpeechSynthesisService(ABC):
    """Interface for speech synthesis services.

    This interface defines the contract for converting text to speech using
    various TTS models and voices. Implementations should support both batch
    processing and streaming synthesis for different use cases.

    Example:
        ```python
        # Use through dependency injection
        tts_service = container.resolve(ISpeechSynthesisService)

        # Create synthesis request
        request = SpeechSynthesisRequest(
            text_content=text_content,
            voice_settings=voice_settings
        )

        # Batch synthesis
        audio = tts_service.synthesize(request)

        # Or streaming synthesis
        for chunk in tts_service.synthesize_stream(request):
            # Process audio chunk in real-time
            play_audio_chunk(chunk)
        ```
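
    Implementations subclass this interface and provide both methods. A
    minimal skeleton is sketched below; the class name is illustrative and
    is not a provider shipped with this module:

        ```python
        class MySpeechSynthesisService(ISpeechSynthesisService):
            def synthesize(self, request):
                # Produce the complete AudioContent in one call.
                ...

            def synthesize_stream(self, request):
                # Yield AudioChunk objects as audio becomes available.
                ...
        ```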
| """ | |

    @abstractmethod
    def synthesize(self, request: 'SpeechSynthesisRequest') -> 'AudioContent':
        """Synthesize speech from text in batch mode.

        Converts text content to audio using the specified voice settings and
        returns the complete audio content. This method is suitable for
        shorter texts or when the complete audio is needed before playback.

        Implementation considerations:
        - Text preprocessing (SSML support, pronunciation handling)
        - Voice loading and configuration
        - Audio quality optimization
        - Memory management for long texts
        - Error recovery and fallback voices

        Args:
            request: The speech synthesis request containing:
                - text_content: Text to synthesize with language information
                - voice_settings: Voice configuration including voice ID, speed,
                  pitch, volume, and other voice-specific parameters

        Returns:
            AudioContent: The synthesized audio containing:
                - data: Raw audio data in the specified format
                - format: Audio format (WAV, MP3, etc.)
                - sample_rate: Audio sample rate in Hz
                - duration: Audio duration in seconds
                - metadata: Additional synthesis information

        Raises:
            SpeechSynthesisException: If synthesis fails due to:
                - Unsupported voice or language
                - Text processing errors (invalid characters, length limits)
                - Voice model loading failures
                - Insufficient system resources
            ValueError: If request parameters are invalid:
                - Empty text content
                - Unsupported voice settings
                - Invalid audio format specifications

        Example:
            ```python
            # Create text content
            text = TextContent(
                text="Hello, this is a test of speech synthesis.",
                language="en"
            )

            # Configure voice settings
            voice_settings = VoiceSettings(
                voice_id="chatterbox",
                speed=1.0,
                pitch=0.0,
                volume=1.0
            )

            # Create synthesis request
            request = SpeechSynthesisRequest(
                text_content=text,
                voice_settings=voice_settings
            )

            # Synthesize audio
            try:
                audio = service.synthesize(request)

                # Save to file
                with open("output.wav", "wb") as f:
                    f.write(audio.data)

                print(f"Generated {audio.duration:.1f}s of audio")
            except SpeechSynthesisException as e:
                print(f"Synthesis failed: {e}")
            ```
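
        The "error recovery and fallback voices" consideration above can also
        be handled at the call site. A minimal sketch, assuming a second,
        pre-configured `fallback_voice_settings` object exists (the name is
        hypothetical):

            ```python
            # Retry once with a fallback voice if the primary voice fails
            try:
                audio = service.synthesize(request)
            except SpeechSynthesisException:
                audio = service.synthesize(
                    SpeechSynthesisRequest(
                        text_content=text,
                        voice_settings=fallback_voice_settings
                    )
                )
            ```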
| """ | |
| pass | |

    @abstractmethod
    def synthesize_stream(self, request: 'SpeechSynthesisRequest') -> Iterator['AudioChunk']:
        """Synthesize speech from text as a stream of audio chunks.

        Converts text content to audio in streaming mode, yielding audio chunks
        as they become available. This method is suitable for real-time playback,
        long texts, or cases where low latency is required.

        Implementation considerations:
        - Chunk size optimization for smooth playback
        - Buffer management and memory efficiency
        - Error handling without breaking the stream
        - Proper stream termination and cleanup
        - Latency minimization for real-time use cases

        Args:
            request: The speech synthesis request containing text and voice settings.
                Same format as batch synthesis but optimized for streaming.

        Yields:
            AudioChunk: Individual audio chunks containing:
                - data: Raw audio data for this chunk
                - format: Audio format (consistent across chunks)
                - sample_rate: Audio sample rate in Hz
                - chunk_index: Sequential chunk number
                - is_final: Boolean indicating whether this is the last chunk
                - timestamp: Chunk generation timestamp

        Raises:
            SpeechSynthesisException: If synthesis fails during streaming:
                - Voice model errors during processing
                - Network issues (for cloud-based synthesis)
                - Resource exhaustion during long synthesis
            ValueError: If request parameters are invalid for streaming

        Example:
            ```python
            # Create streaming synthesis request
            request = SpeechSynthesisRequest(
                text_content=long_text,
                voice_settings=voice_settings
            )

            # Stream synthesis with real-time playback
            audio_buffer = []
            try:
                for chunk in service.synthesize_stream(request):
                    # Add to playback buffer
                    audio_buffer.append(chunk.data)

                    # Start playback when the buffer is sufficient
                    if len(audio_buffer) >= 3:  # Buffer 3 chunks
                        play_audio_chunk(audio_buffer.pop(0))

                    # Handle the final chunk
                    if chunk.is_final:
                        # Play remaining buffered chunks
                        for remaining in audio_buffer:
                            play_audio_chunk(remaining)
                        break
            except SpeechSynthesisException as e:
                print(f"Streaming synthesis failed: {e}")
            ```
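
        When playback is not needed, the stream can also be collected and
        written out in one piece. A minimal sketch reusing the same request
        (the output filename is arbitrary):

            ```python
            # Concatenate raw chunk data and write it once the stream ends
            audio_bytes = bytearray()
            for chunk in service.synthesize_stream(request):
                audio_bytes.extend(chunk.data)
                if chunk.is_final:
                    break

            with open("streamed_output.wav", "wb") as f:
                f.write(bytes(audio_bytes))
            ```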
| """ | |
| pass |