import asyncio
import base64
import logging

from google.genai import types

from services.gemini_client import get_gemini_client

logger = logging.getLogger(__name__)


def _extract_audio_bytes(response) -> bytes:
    """Return the first inline audio payload found in a Gemini response.

    Only the first candidate is inspected, but every part of that candidate
    is scanned so that a leading non-audio part does not hide the audio.

    Args:
        response: Object returned by ``client.models.generate_content``.

    Returns:
        The raw bytes carried by the first part that has inline data.

    Raises:
        Exception: If the response has no candidates/content/parts, or if
            none of the parts carries inline data.
    """
    candidates = response.candidates
    if not candidates:
        raise Exception("Invalid response format from TTS service")

    content = candidates[0].content
    parts = content.parts if content else None
    if not parts:
        raise Exception("Invalid response format from TTS service")

    for part in parts:
        inline_data = getattr(part, "inline_data", None)
        payload = getattr(inline_data, "data", None) if inline_data else None
        if not payload:
            continue
        # The SDK is expected to hand back already-decoded bytes; decoding
        # them a second time would corrupt the audio. Only a str payload is
        # treated as base64 text that still needs decoding.
        if isinstance(payload, str):
            return base64.b64decode(payload)
        return bytes(payload)

    raise Exception("No audio data in response")


async def generate_tts(text: str) -> bytes:
    """Convert text to speech using the Gemini API.

    Args:
        text: Text to convert to speech.

    Returns:
        Audio bytes exactly as returned by the service. The request asks for
        ``audio/wav``, but the payload is not validated or re-encoded here.

    Raises:
        Exception: If the request fails or the response carries no audio.
            The original error is attached as ``__cause__``.
    """
    try:
        client = get_gemini_client()

        logger.info("Generating speech for: '%s'", text)

        # NOTE(review): the official Gemini speech-generation docs appear to
        # use response_modalities=["AUDIO"] with a dedicated TTS model rather
        # than response_mime_type="audio/wav" -- confirm before relying on
        # this request shape. It is kept unchanged here.
        #
        # generate_content is a blocking call; run it in a worker thread so
        # this coroutine does not stall the event loop while waiting.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model="gemini-2.0-flash-exp",
            contents=[f"Convert this to speech: {text}"],
            config=types.GenerateContentConfig(
                response_mime_type="audio/wav",
            ),
        )

        audio_bytes = _extract_audio_bytes(response)
    except Exception as e:
        logger.error("✗ TTS failed: %s", e)
        # Keep the generic Exception type callers already catch, but chain
        # the cause so the original traceback is not lost.
        raise Exception(f"Text-to-speech generation failed: {e}") from e

    logger.info("✓ TTS successful: %d bytes generated", len(audio_bytes))
    return audio_bytes