import logging
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple

from utils.tts_base import TTSBase

# Configure logging
logger = logging.getLogger(__name__)

# Flag to track CosyVoice2 availability
COSYVOICE2_AVAILABLE = False

# Sample rate used when writing/streaming output audio; it should match the
# output rate of the loaded CosyVoice checkpoint
DEFAULT_SAMPLE_RATE = 24000

# Try to import CosyVoice2 dependencies
try:
    import torch
    import torchaudio
    # Import the CosyVoice model class from the cosyvoice package
    # (https://github.com/FunAudioLLM/CosyVoice)
    from cosyvoice.cli.cosyvoice import CosyVoice
    COSYVOICE2_AVAILABLE = True
    logger.info("CosyVoice2 TTS engine is available")
except ImportError as e:
    # ModuleNotFoundError is a subclass of ImportError, so a single handler
    # covers both missing packages and broken installs
    logger.warning(f"CosyVoice2 TTS engine is not available - {type(e).__name__}: {e}")
    COSYVOICE2_AVAILABLE = False
def _get_model():
    """Lazy-load the CosyVoice model.

    Returns:
        CosyVoice or None: The loaded CosyVoice model, or None if unavailable
    """
    if not COSYVOICE2_AVAILABLE:
        logger.warning("CosyVoice2 TTS engine is not available")
        return None
    try:
        from cosyvoice.cli.cosyvoice import CosyVoice
        # Initialize the model from the expected checkpoint directory
        model = CosyVoice('pretrained_models/CosyVoice-300M')
        logger.info("CosyVoice2 model successfully loaded")
        return model
    except ImportError as e:
        logger.error(f"Failed to import CosyVoice2 dependencies: {e}")
        return None
    except FileNotFoundError as e:
        logger.error(f"Failed to load CosyVoice2 model files: {e}")
        return None
    except Exception as e:
        logger.error(f"Failed to initialize CosyVoice2 model: {e}")
        return None
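# ---------------------------------------------------------------------------
# Note: _get_model() expects the checkpoint at pretrained_models/CosyVoice-300M.
# One way to fetch it (following the upstream CosyVoice README; assumes the
# modelscope package is installed) is:
#
#     from modelscope import snapshot_download
#     snapshot_download('iic/CosyVoice-300M',
#                       local_dir='pretrained_models/CosyVoice-300M')
# ---------------------------------------------------------------------------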
class CosyVoice2TTS(TTSBase):
    """CosyVoice2 TTS engine implementation.

    This engine uses the CosyVoice2 model for TTS generation.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the CosyVoice2 TTS engine.

        Args:
            lang_code (str): Language code for the engine
        """
        super().__init__(lang_code)
        self.model = None

    def _ensure_model(self):
        """Ensure the model is loaded.

        Returns:
            bool: True if the model is available, False otherwise
        """
        if self.model is None:
            self.model = _get_model()
        return self.model is not None

    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
        """Generate speech using the CosyVoice2 TTS engine.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (not currently used by CosyVoice2)
            speed (float): Speech speed multiplier (not currently used by CosyVoice2)

        Returns:
            Optional[str]: Path to the generated audio file, or None if generation fails
        """
        logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return None

        # Ensure the model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return None

        try:
            # Generate a unique output path
            output_path = self._generate_output_path(prefix="cosyvoice2")

            # Generate audio using CosyVoice2
            try:
                # Use the SFT inference method with a built-in speaker
                output = self.model.inference_sft(text, '中文女')
            except Exception as api_error:
                # Fall back to the zero-shot API if SFT inference fails
                try:
                    output = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
                except Exception as alt_error:
                    logger.error(
                        f"CosyVoice2 inference failed: {api_error}; "
                        f"zero-shot fallback also failed: {alt_error}"
                    )
                    return None

            # Normalise the model output to a 1-D numpy array. Depending on the
            # CosyVoice version, the result may be a dict containing a
            # 'tts_speech' tensor, a raw tensor, or a numpy array.
            if isinstance(output, dict) and 'tts_speech' in output:
                output = output['tts_speech']
            if isinstance(output, torch.Tensor):
                output_audio_np = output.squeeze().cpu().numpy()
            elif output is not None:
                output_audio_np = np.asarray(output).squeeze()
            else:
                output_audio_np = None

            if output_audio_np is not None:
                logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
                logger.info(f"CosyVoice2 audio generation complete: {output_path}")
                return output_path
            else:
                logger.error("CosyVoice2 model returned no audio output")
                return None
        except Exception as e:
            logger.error(f"Error generating speech with CosyVoice2: {e}", exc_info=True)
            return None
    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate a speech stream using the CosyVoice2 TTS engine.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (not currently used by CosyVoice2)
            speed (float): Speech speed multiplier (not currently used by CosyVoice2)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return

        # Ensure the model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return

        try:
            # Generate audio using CosyVoice2
            try:
                # Use the SFT inference method with a built-in speaker
                output = self.model.inference_sft(text, '中文女')
            except Exception as api_error:
                # Fall back to the zero-shot API if SFT inference fails
                try:
                    output = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
                except Exception as alt_error:
                    logger.error(
                        f"CosyVoice2 inference failed: {api_error}; "
                        f"zero-shot fallback also failed: {alt_error}"
                    )
                    return

            # Normalise the model output to a 1-D numpy array (see generate_speech)
            if isinstance(output, dict) and 'tts_speech' in output:
                output = output['tts_speech']
            if isinstance(output, torch.Tensor):
                output_audio_np = output.squeeze().cpu().numpy()
            elif output is not None:
                output_audio_np = np.asarray(output).squeeze()
            else:
                output_audio_np = None

            if output_audio_np is not None:
                logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                yield DEFAULT_SAMPLE_RATE, output_audio_np
            else:
                logger.error("CosyVoice2 model returned no audio output")
                return
        except Exception as e:
            logger.error(f"Error generating speech stream with CosyVoice2: {e}", exc_info=True)
            return
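

# ---------------------------------------------------------------------------
# Example usage (illustrative sketch): assumes the CosyVoice checkpoint is
# available at pretrained_models/CosyVoice-300M and that utils.tts_base.TTSBase
# provides the _generate_output_path() helper used above. If CosyVoice is not
# installed, both calls simply log an error and produce no audio.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    tts = CosyVoice2TTS(lang_code='z')

    # One-shot synthesis: returns a path to the generated audio file, or None
    wav_path = tts.generate_speech('你好，欢迎使用语音合成。')
    print(f'Generated file: {wav_path}')

    # Streaming synthesis: yields (sample_rate, numpy array) segments
    for sample_rate, audio in tts.generate_speech_stream('你好，欢迎使用语音合成。'):
        print(f'Received segment: {audio.shape} samples at {sample_rate} Hz')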