|
|
from services.gemini_client import get_gemini_client |
|
|
from google.genai import types |
|
|
import base64 |
|
|
import logging |
|
|
|
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
def _extract_audio_bytes(response) -> bytes:
    """Return the audio payload of the first inline-data part in *response*.

    Only the first candidate is inspected, but every part of that candidate
    is scanned, so a leading non-audio part does not hide the audio.

    Raises:
        Exception: If the response has no candidate/content/parts, or if no
            part carries inline data.
    """
    candidates = getattr(response, "candidates", None)
    if not candidates:
        raise Exception("Invalid response format from TTS service")

    content = candidates[0].content
    if not content or not content.parts:
        raise Exception("Invalid response format from TTS service")

    for part in content.parts:
        blob = getattr(part, "inline_data", None)
        if not blob or not blob.data:
            continue
        payload = blob.data
        # Decode only textual payloads. A bytes payload is treated as the raw
        # audio already; running it through b64decode again would corrupt it
        # or raise binascii.Error.
        # NOTE(review): assumes the client hands back raw bytes in
        # inline_data.data (as the google-genai SDK documents) -- confirm.
        if isinstance(payload, str):
            return base64.b64decode(payload)
        return bytes(payload)

    raise Exception("No audio data in response")


async def generate_tts(text: str) -> bytes:
    """
    Convert text to speech using the Gemini API.

    The client's synchronous ``generate_content`` call is executed in a worker
    thread so that awaiting this coroutine does not block the event loop.

    Args:
        text: Text to convert to speech. Must contain at least one
            non-whitespace character.

    Returns:
        The audio bytes found in the response. The request asks for
        ``audio/wav``; the returned bytes are not validated here.

    Raises:
        Exception: If TTS generation fails for any reason (blank input,
            client/API error, or a response without audio). The message is
            prefixed with "Text-to-speech generation failed: " and the
            original error is attached as ``__cause__``.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import section.
    import asyncio

    try:
        if not text or not text.strip():
            raise ValueError("text must be a non-empty string")

        client = get_gemini_client()

        logger.info("Generating speech for: '%s'", text)

        # NOTE(review): the documented Gemini speech-generation path appears
        # to use response_modalities=["AUDIO"] with a speech_config on a
        # TTS-capable model; response_mime_type="audio/wav" may be rejected
        # by the API. Request shape left unchanged -- confirm against the
        # Gemini docs.
        response = await asyncio.to_thread(
            client.models.generate_content,
            model="gemini-2.0-flash-exp",
            contents=[f"Convert this to speech: {text}"],
            config=types.GenerateContentConfig(
                response_mime_type="audio/wav",
            ),
        )

        audio_bytes = _extract_audio_bytes(response)

        logger.info("β TTS successful: %s bytes generated", len(audio_bytes))

        return audio_bytes

    except Exception as e:
        logger.error("β TTS failed: %s", e)

        # Chain the original error so its traceback is preserved for callers.
        raise Exception(f"Text-to-speech generation failed: {str(e)}") from e