File size: 2,206 Bytes
73d4f3c
91b1985
4f1c42b
4a13628
c7fc3b6
4a13628
9aa985d
4f1c42b
4a13628
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
918acab
 
 
 
4a13628
918acab
 
4a13628
918acab
4a13628
 
 
918acab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a13628
 
 
 
 
 
 
918acab
4a13628
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import asyncio
import base64
import logging

from google.genai import types

from services.gemini_client import get_gemini_client

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


def _extract_audio_bytes(response) -> bytes:
    """Return the first inline audio payload found in a Gemini response.

    Only the first candidate is inspected. Its parts are scanned in order and
    the first part carrying a non-empty ``inline_data.data`` is returned.

    Args:
        response: Object returned by ``client.models.generate_content``.

    Returns:
        The inline payload as bytes.

    Raises:
        Exception: If the response has no candidates, content or parts, or
            if none of the parts carries inline data.
    """
    candidates = getattr(response, "candidates", None)
    content = candidates[0].content if candidates else None
    parts = content.parts if content else None
    if not parts:
        raise Exception("Invalid response format from TTS service")

    for part in parts:
        blob = getattr(part, "inline_data", None)
        payload = getattr(blob, "data", None) if blob else None
        if not payload:
            continue
        # Per the google-genai types reference, Blob.data is already raw
        # bytes; base64-decoding it a second time would corrupt the audio
        # (b64decode silently drops non-alphabet bytes by default). Only a
        # str payload still needs decoding.
        if isinstance(payload, str):
            return base64.b64decode(payload)
        return bytes(payload)

    raise Exception("No audio data in response")


async def generate_tts(text: str) -> bytes:
    """
    Convert text to speech using the Gemini API.

    The request is sent through the client returned by
    ``get_gemini_client()``. The SDK call is synchronous, so it is run in a
    worker thread to avoid blocking the event loop while waiting on the
    network.

    Args:
        text: Text to convert to speech. Must contain at least one
            non-whitespace character.

    Returns:
        Audio bytes as returned by the API (``audio/wav`` is what the request
        asks for via ``response_mime_type``).

    Raises:
        Exception: If the input is empty, the request fails, or the response
            carries no audio. The original error is attached as
            ``__cause__``.
    """
    try:
        if not text or not text.strip():
            # Raised inside the try so callers see the same wrapped
            # Exception type/message prefix as for every other failure.
            raise ValueError("text must not be empty")

        client = get_gemini_client()

        logger.info("Generating speech for: '%s'", text)

        # NOTE(review): Gemini's speech-generation guide uses a TTS-capable
        # model with response_modalities=["AUDIO"] and returns raw PCM rather
        # than a WAV container. Confirm that this model/config pair really
        # yields WAV audio; the request below is kept as originally written.
        #
        # generate_content is a blocking call; asyncio.to_thread (3.9+)
        # keeps the event loop responsive while it runs.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model="gemini-2.0-flash-exp",
            contents=[f"Convert this to speech: {text}"],
            config=types.GenerateContentConfig(
                response_mime_type="audio/wav",
            ),
        )

        audio_bytes = _extract_audio_bytes(response)
    except Exception as e:
        logger.error("✗ TTS failed: %s", e)
        # Chain the cause so the original traceback is not lost.
        raise Exception(f"Text-to-speech generation failed: {e}") from e
    else:
        logger.info("✓ TTS successful: %d bytes generated", len(audio_bytes))
        return audio_bytes