# Spaces: NLP-Debater-Project / FastAPI-Backend-Models (Running)
# File size: 2,206 Bytes

import asyncio
import base64
import io
import logging
import wave

from google.genai import types

from services.gemini_client import get_gemini_client

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


# Model and voice used for speech synthesis.
# NOTE(review): values follow the public Gemini speech-generation guide;
# confirm they are the ones wanted for this deployment.
_TTS_MODEL = "gemini-2.5-flash-preview-tts"
_TTS_VOICE = "Kore"

# Raw PCM layout documented for Gemini TTS output (24 kHz, 16-bit, mono).
# The sample rate is only a fallback for when the MIME type carries no rate.
_PCM_SAMPLE_RATE = 24000
_PCM_SAMPLE_WIDTH = 2
_PCM_CHANNELS = 1


def _sample_rate_from_mime(mime_type, default=_PCM_SAMPLE_RATE):
    """Return the ``rate=`` parameter of an audio MIME type, or *default*."""
    for param in (mime_type or "").split(";")[1:]:
        key, _, value = param.strip().partition("=")
        if key.lower() == "rate" and value.isdigit():
            return int(value)
    return default


def _pcm_to_wav(pcm, sample_rate=_PCM_SAMPLE_RATE):
    """Wrap raw PCM frames in a WAV (RIFF) container and return the bytes."""
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_file:
        wav_file.setnchannels(_PCM_CHANNELS)
        wav_file.setsampwidth(_PCM_SAMPLE_WIDTH)
        wav_file.setframerate(sample_rate)
        wav_file.writeframes(pcm)
    return buffer.getvalue()


def _extract_inline_audio(response):
    """Return ``(audio_bytes, mime_type)`` from the first part with inline data."""
    candidates = getattr(response, "candidates", None)
    content = candidates[0].content if candidates else None
    if content is None or not content.parts:
        raise RuntimeError("Invalid response format from TTS service")

    for part in content.parts:
        blob = getattr(part, "inline_data", None)
        if blob is None or not blob.data:
            continue
        payload = blob.data
        # The SDK normally hands back raw bytes; only a string payload is
        # still base64 text. Decoding bytes again would corrupt the audio.
        if isinstance(payload, str):
            payload = base64.b64decode(payload)
        return bytes(payload), getattr(blob, "mime_type", None) or ""

    raise RuntimeError("No audio data in response")


async def generate_tts(text: str) -> bytes:
    """
    Convert text to speech using the Gemini API.

    The request asks a Gemini TTS model for the AUDIO modality. The blocking
    SDK call runs in a worker thread so the event loop is not stalled. If the
    returned payload is not already a RIFF/WAV file it is treated as raw PCM
    and wrapped in a WAV container.

    Args:
        text: Text to convert to speech. Must contain non-whitespace characters.

    Returns:
        Audio bytes in WAV format

    Raises:
        ValueError: If ``text`` is not a string or is blank
        Exception: If TTS generation fails (the original error is chained
            as ``__cause__``)
    """
    # Reject unusable input before spending an API call on it.
    if not isinstance(text, str) or not text.strip():
        raise ValueError("text must be a non-empty string")

    try:
        client = get_gemini_client()

        logger.info("Generating speech for: '%s'", text)

        config = types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name=_TTS_VOICE,
                    ),
                ),
            ),
        )

        # generate_content is synchronous; run it off the event loop.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model=_TTS_MODEL,
            contents=text,
            config=config,
        )

        payload, mime_type = _extract_inline_audio(response)

        if payload[:4] == b"RIFF":
            # Already a WAV file; pass it through untouched.
            audio_bytes = payload
        else:
            # Headerless PCM: add a WAV header so callers get playable audio.
            audio_bytes = _pcm_to_wav(payload, _sample_rate_from_mime(mime_type))

        logger.info("✓ TTS successful: %d bytes generated", len(audio_bytes))

        return audio_bytes

    except Exception as exc:
        # Log with traceback, then re-raise under the message callers expect,
        # keeping the original error as the cause.
        logger.exception("✗ TTS failed: %s", exc)
        raise Exception(f"Text-to-speech generation failed: {exc}") from exc