Spaces:

DroolingPanda
/

teachingAssistant

Sleeping

App Files Files Community

Michael Hu committed on Jul 29, 2025

Commit

0c2d9e7

1 Parent(s): 74466cd

attempt to fix cosyvoice2 tts

Browse files

Files changed (4) hide show

src/application/error_handling/error_mapper.py +13 -0
src/application/services/audio_processing_service.py +31 -6
src/infrastructure/tts/cosyvoice2_provider.py +87 -39
utils/tts_cosyvoice2.py +82 -61

src/application/error_handling/error_mapper.py CHANGED Viewed

@@ -202,6 +202,19 @@ class ErrorMapper:
                     "Retry the operation",
                     "Check system load and try again later"
                 ]
             )
         }

                     "Retry the operation",
                     "Check system load and try again later"
                 ]
+            ),
+            # Type errors
+            TypeError: ErrorMapping(
+                user_message="Invalid data type provided. This is likely a configuration or implementation issue.",
+                error_code="TYPE_ERROR",
+                severity=ErrorSeverity.HIGH,
+                category=ErrorCategory.SYSTEM,
+                recovery_suggestions=[
+                    "Retry the operation",
+                    "Try a different voice or model",
+                    "Contact support if the issue persists"
+                ]
             )
         }

src/application/services/audio_processing_service.py CHANGED Viewed

@@ -53,7 +53,7 @@ class AudioProcessingApplicationService:
         """
         try:
             logger.info("Initializing AudioProcessingApplicationService...")
             self._container = container
             self._config = config or container.resolve(AppConfig)
             self._temp_files: Dict[str, str] = {}  # Track temporary files for cleanup
@@ -66,7 +66,7 @@ class AudioProcessingApplicationService:
             # self._setup_logging()
             logger.info("AudioProcessingApplicationService initialized successfully")
         except Exception as e:
             print(f"Error: Failed to initialize AudioProcessingApplicationService: {e}")
             raise
@@ -520,32 +520,42 @@ class AudioProcessingApplicationService:
         """
         try:
             logger.info(
-                f"Starting TTS with voice: {voice}, speed: {speed} "
                 f"[correlation_id={correlation_id}]"
             )
             # Get TTS provider from container
             tts_provider = self._container.get_tts_provider(voice)
             # Create voice settings
             voice_settings = VoiceSettings(
                 voice_id=voice,
                 speed=speed,
                 language=language
             )
             # Create synthesis request
             synthesis_request = SpeechSynthesisRequest(
-                text=text.text,
                 voice_settings=voice_settings
             )
             # Perform synthesis
             audio_content = tts_provider.synthesize(synthesis_request)
             # Save output to file
             output_filename = f"output_{correlation_id}.{audio_content.format}"
             output_path = os.path.join(temp_dir, output_filename)
             with open(output_path, 'wb') as f:
                 f.write(audio_content.data)
@@ -561,7 +571,7 @@ class AudioProcessingApplicationService:
             return output_path
         except Exception as e:
-            logger.error(f"TTS failed: {e} [correlation_id={correlation_id}]")
             raise SpeechSynthesisException(f"Speech synthesis failed: {str(e)}")
     def _get_error_code_from_exception(self, exception: Exception) -> str:
@@ -792,11 +802,23 @@ class AudioProcessingApplicationService:
             component="AudioProcessingApplicationService"
         )
         def tts_operation():
-            return self._perform_speech_synthesis(text, voice, speed, language, temp_dir, correlation_id)
         try:
             # Try with circuit breaker protection
             return self._recovery_manager.execute_with_circuit_breaker(
                 tts_operation,
                 f"tts_{voice}",
@@ -805,6 +827,8 @@ class AudioProcessingApplicationService:
             )
         except Exception as e:
             # Try fallback TTS providers
             tts_config = self._config.get_tts_config()
             fallback_voices = [v for v in tts_config['preferred_providers'] if v != voice]
@@ -829,4 +853,5 @@ class AudioProcessingApplicationService:
                     correlation_id
                 )
             else:
                 raise

         """
         try:
             logger.info("Initializing AudioProcessingApplicationService...")
             self._container = container
             self._config = config or container.resolve(AppConfig)
             self._temp_files: Dict[str, str] = {}  # Track temporary files for cleanup
             # self._setup_logging()
             logger.info("AudioProcessingApplicationService initialized successfully")
         except Exception as e:
             print(f"Error: Failed to initialize AudioProcessingApplicationService: {e}")
             raise
         """
         try:
             logger.info(
+                f"Starting TTS with voice: {voice}, speed: {speed}, language: {language} "
                 f"[correlation_id={correlation_id}]"
             )
+            logger.info(f"Text to synthesize length: {len(text.text)} characters")
             # Get TTS provider from container
+            logger.info(f"Getting TTS provider for voice: {voice}")
             tts_provider = self._container.get_tts_provider(voice)
+            logger.info(f"TTS provider obtained: {tts_provider.__class__.__name__}")
             # Create voice settings
+            logger.info("Creating voice settings")
             voice_settings = VoiceSettings(
                 voice_id=voice,
                 speed=speed,
                 language=language
             )
+            logger.info(f"Voice settings created: {voice_settings}")
             # Create synthesis request
+            logger.info("Creating synthesis request")
             synthesis_request = SpeechSynthesisRequest(
+                text_content=text,  # text is already a TextContent object
                 voice_settings=voice_settings
             )
+            logger.info("Synthesis request created successfully")
             # Perform synthesis
+            logger.info("Starting TTS synthesis")
             audio_content = tts_provider.synthesize(synthesis_request)
+            logger.info(f"TTS synthesis completed, audio format: {audio_content.format}, data length: {len(audio_content.data)}")
             # Save output to file
             output_filename = f"output_{correlation_id}.{audio_content.format}"
             output_path = os.path.join(temp_dir, output_filename)
+            logger.info(f"Saving audio to: {output_path}")
             with open(output_path, 'wb') as f:
                 f.write(audio_content.data)
             return output_path
         except Exception as e:
+            logger.error(f"TTS failed: {e} [correlation_id={correlation_id}]", exc_info=True)
             raise SpeechSynthesisException(f"Speech synthesis failed: {str(e)}")
     def _get_error_code_from_exception(self, exception: Exception) -> str:
             component="AudioProcessingApplicationService"
         )
+        logger.info(f"Starting TTS synthesis with recovery [correlation_id={correlation_id}]")
+        logger.info(f"Parameters: voice={voice}, speed={speed}, language={language}")
+        logger.info(f"Text type: {type(text)}, Text content type: {type(text.text) if hasattr(text, 'text') else 'N/A'}")
         def tts_operation():
+            logger.info(f"Executing TTS operation [correlation_id={correlation_id}]")
+            try:
+                result = self._perform_speech_synthesis(text, voice, speed, language, temp_dir, correlation_id)
+                logger.info(f"TTS operation completed successfully [correlation_id={correlation_id}]")
+                return result
+            except Exception as e:
+                logger.error(f"TTS operation failed: {str(e)} [correlation_id={correlation_id}]", exc_info=True)
+                raise
         try:
             # Try with circuit breaker protection
+            logger.info(f"Attempting TTS with circuit breaker [correlation_id={correlation_id}]")
             return self._recovery_manager.execute_with_circuit_breaker(
                 tts_operation,
                 f"tts_{voice}",
             )
         except Exception as e:
+            logger.error(f"Primary TTS failed, trying fallbacks: {str(e)} [correlation_id={correlation_id}]", exc_info=True)
             # Try fallback TTS providers
             tts_config = self._config.get_tts_config()
             fallback_voices = [v for v in tts_config['preferred_providers'] if v != voice]
                     correlation_id
                 )
             else:
+                logger.error(f"No fallback voices available [correlation_id={correlation_id}]")
                 raise

src/infrastructure/tts/cosyvoice2_provider.py CHANGED Viewed

@@ -21,15 +21,17 @@ DEFAULT_SAMPLE_RATE = 24000
 # Try to import CosyVoice2 dependencies
 try:
     import torch
-    # Import CosyVoice2 - assuming it's installed and has a similar API to Dia
-    # since they're both from nari-labs according to the GitHub link
-    from cosyvoice2.model import CosyVoice2
     COSYVOICE2_AVAILABLE = True
     logger.info("CosyVoice2 TTS engine is available")
-except ImportError:
-    logger.warning("CosyVoice2 TTS engine is not available")
 except ModuleNotFoundError as e:
-    logger.warning(f"CosyVoice2 TTS engine is not available: {str(e)}")
     COSYVOICE2_AVAILABLE = False
@@ -49,20 +51,28 @@ class CosyVoice2TTSProvider(TTSProviderBase):
         """Ensure the model is loaded."""
         if self.model is None and COSYVOICE2_AVAILABLE:
             try:
                 import torch
-                from cosyvoice2.model import CosyVoice2
-                self.model = CosyVoice2.from_pretrained()
                 logger.info("CosyVoice2 model successfully loaded")
             except ImportError as e:
-                logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}")
                 self.model = None
             except FileNotFoundError as e:
-                logger.error(f"Failed to load CosyVoice2 model files: {str(e)}")
                 self.model = None
             except Exception as e:
-                logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}")
                 self.model = None
-        return self.model is not None
     def is_available(self) -> bool:
         """Check if CosyVoice2 TTS is available."""
@@ -75,36 +85,66 @@ class CosyVoice2TTSProvider(TTSProviderBase):
     def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
         """Generate audio using CosyVoice2 TTS."""
         if not self.is_available():
             raise SpeechSynthesisException("CosyVoice2 TTS engine is not available")
         try:
             import torch
             # Extract parameters from request
             text = request.text_content.text
             # Generate audio using CosyVoice2
-            with torch.inference_mode():
-                # Assuming CosyVoice2 has a similar API to Dia
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    use_torch_compile=False,
-                    verbose=False
-                )
             if output_audio_np is None:
                 raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
             # Convert numpy array to bytes
             audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
             return audio_bytes, DEFAULT_SAMPLE_RATE
         except Exception as e:
             self._handle_provider_error(e, "audio generation")
     def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
@@ -114,22 +154,30 @@ class CosyVoice2TTSProvider(TTSProviderBase):
         try:
             import torch
             # Extract parameters from request
             text = request.text_content.text
             # Generate audio using CosyVoice2
-            with torch.inference_mode():
-                # Assuming CosyVoice2 has a similar API to Dia
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    use_torch_compile=False,
-                    verbose=False
-                )
             if output_audio_np is None:
                 raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
@@ -147,13 +195,13 @@ class CosyVoice2TTSProvider(TTSProviderBase):
         try:
             # Create an in-memory buffer
             buffer = io.BytesIO()
             # Write audio data to buffer as WAV
             sf.write(buffer, audio_array, sample_rate, format='WAV')
             # Get bytes from buffer
             buffer.seek(0)
             return buffer.read()
         except Exception as e:
             raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e

 # Try to import CosyVoice2 dependencies
 try:
     import torch
+    import torchaudio
+    # Import CosyVoice2 from the correct package
+    # Based on https://github.com/FunAudioLLM/CosyVoice
+    from cosyvoice.cli.cosyvoice import CosyVoice
     COSYVOICE2_AVAILABLE = True
     logger.info("CosyVoice2 TTS engine is available")
+except ImportError as e:
+    logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")
+    COSYVOICE2_AVAILABLE = False
 except ModuleNotFoundError as e:
+    logger.warning(f"CosyVoice2 TTS engine is not available - ModuleNotFoundError: {str(e)}")
     COSYVOICE2_AVAILABLE = False
         """Ensure the model is loaded."""
         if self.model is None and COSYVOICE2_AVAILABLE:
             try:
+                logger.info("Loading CosyVoice2 model...")
                 import torch
+                import torchaudio
+                from cosyvoice.cli.cosyvoice import CosyVoice
+                # Initialize CosyVoice with the correct model path
+                # You may need to adjust the model path based on your installation
+                self.model = CosyVoice('pretrained_models/CosyVoice-300M')
                 logger.info("CosyVoice2 model successfully loaded")
             except ImportError as e:
+                logger.error(f"Failed to import CosyVoice2 dependencies: {str(e)}", exc_info=True)
                 self.model = None
             except FileNotFoundError as e:
+                logger.error(f"Failed to load CosyVoice2 model files: {str(e)}", exc_info=True)
                 self.model = None
             except Exception as e:
+                logger.error(f"Failed to initialize CosyVoice2 model: {str(e)}", exc_info=True)
                 self.model = None
+        model_available = self.model is not None
+        logger.info(f"CosyVoice2 model availability check: {model_available}")
+        return model_available
     def is_available(self) -> bool:
         """Check if CosyVoice2 TTS is available."""
     def _generate_audio(self, request: 'SpeechSynthesisRequest') -> tuple[bytes, int]:
         """Generate audio using CosyVoice2 TTS."""
+        logger.info("Starting CosyVoice2 audio generation")
         if not self.is_available():
+            logger.error("CosyVoice2 TTS engine is not available")
             raise SpeechSynthesisException("CosyVoice2 TTS engine is not available")
         try:
             import torch
             # Extract parameters from request
             text = request.text_content.text
+            logger.info(f"CosyVoice2 generating audio for text length: {len(text)}")
+            logger.info(f"Voice settings: voice_id={request.voice_settings.voice_id}, speed={request.voice_settings.speed}")
             # Generate audio using CosyVoice2
+            logger.info("Starting CosyVoice2 model inference")
+            # CosyVoice API - using inference method
+            # The model expects text and returns audio tensor
+            try:
+                # Use the inference method from CosyVoice
+                output_audio_tensor = self.model.inference_sft(text, '中文女')
+                # Convert tensor to numpy array
+                if isinstance(output_audio_tensor, torch.Tensor):
+                    output_audio_np = output_audio_tensor.cpu().numpy()
+                else:
+                    output_audio_np = output_audio_tensor
+                logger.info("CosyVoice2 model inference completed")
+            except Exception as api_error:
+                logger.error(f"CosyVoice2 API error: {str(api_error)}")
+                # Try alternative API if the first one fails
+                try:
+                    logger.info("Trying alternative CosyVoice2 API")
+                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
+                    if isinstance(output_audio_tensor, torch.Tensor):
+                        output_audio_np = output_audio_tensor.cpu().numpy()
+                    else:
+                        output_audio_np = output_audio_tensor
+                    logger.info("CosyVoice2 alternative API succeeded")
+                except Exception as alt_error:
+                    logger.error(f"CosyVoice2 alternative API also failed: {str(alt_error)}")
+                    raise SpeechSynthesisException(f"CosyVoice2 inference failed: {str(api_error)}")
             if output_audio_np is None:
+                logger.error("CosyVoice2 model returned None for audio output")
                 raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
+            logger.info(f"CosyVoice2 generated audio array shape: {output_audio_np.shape if hasattr(output_audio_np, 'shape') else 'unknown'}")
             # Convert numpy array to bytes
+            logger.info("Converting CosyVoice2 audio to bytes")
             audio_bytes = self._numpy_to_bytes(output_audio_np, sample_rate=DEFAULT_SAMPLE_RATE)
+            logger.info(f"CosyVoice2 audio conversion completed, bytes length: {len(audio_bytes)}")
             return audio_bytes, DEFAULT_SAMPLE_RATE
         except Exception as e:
+            logger.error(f"CosyVoice2 audio generation failed: {str(e)}", exc_info=True)
             self._handle_provider_error(e, "audio generation")
     def _generate_audio_stream(self, request: 'SpeechSynthesisRequest') -> Iterator[tuple[bytes, int, bool]]:
         try:
             import torch
             # Extract parameters from request
             text = request.text_content.text
             # Generate audio using CosyVoice2
+            try:
+                # Use the inference method from CosyVoice
+                output_audio_tensor = self.model.inference_sft(text, '中文女')
+                # Convert tensor to numpy array
+                if isinstance(output_audio_tensor, torch.Tensor):
+                    output_audio_np = output_audio_tensor.cpu().numpy()
+                else:
+                    output_audio_np = output_audio_tensor
+            except Exception as api_error:
+                # Try alternative API if the first one fails
+                try:
+                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
+                    if isinstance(output_audio_tensor, torch.Tensor):
+                        output_audio_np = output_audio_tensor.cpu().numpy()
+                    else:
+                        output_audio_np = output_audio_tensor
+                except Exception as alt_error:
+                    raise SpeechSynthesisException(f"CosyVoice2 inference failed: {str(api_error)}")
             if output_audio_np is None:
                 raise SpeechSynthesisException("CosyVoice2 model returned None for audio output")
         try:
             # Create an in-memory buffer
             buffer = io.BytesIO()
             # Write audio data to buffer as WAV
             sf.write(buffer, audio_array, sample_rate, format='WAV')
             # Get bytes from buffer
             buffer.seek(0)
             return buffer.read()
         except Exception as e:
             raise SpeechSynthesisException(f"Failed to convert audio to bytes: {str(e)}") from e

utils/tts_cosyvoice2.py CHANGED Viewed

@@ -15,34 +15,37 @@ DEFAULT_SAMPLE_RATE = 24000
 # Try to import CosyVoice2 dependencies
 try:
     import torch
-    # Import CosyVoice2 - assuming it's installed and has a similar API to Dia
-    # since they're both from nari-labs according to the GitHub link
-    from cosyvoice2.model import CosyVoice2
     COSYVOICE2_AVAILABLE = True
     logger.info("CosyVoice2 TTS engine is available")
-except ImportError:
-    logger.warning("CosyVoice2 TTS engine is not available")
 except ModuleNotFoundError as e:
-    logger.warning(f"CosyVoice2 TTS engine is not available: {str(e)}")
     COSYVOICE2_AVAILABLE = False
 def _get_model():
     """Lazy-load the CosyVoice2 model
     Returns:
         CosyVoice2 or None: The CosyVoice2 model or None if not available
     """
     if not COSYVOICE2_AVAILABLE:
         logger.warning("CosyVoice2 TTS engine is not available")
         return None
     try:
         import torch
-        from cosyvoice2.model import CosyVoice2
-        # Initialize the model
-        model = CosyVoice2.from_pretrained()
         logger.info("CosyVoice2 model successfully loaded")
         return model
     except ImportError as e:
@@ -58,72 +61,81 @@ def _get_model():
 class CosyVoice2TTS(TTSBase):
     """CosyVoice2 TTS engine implementation
     This engine uses the CosyVoice2 model for TTS generation.
     """
     def __init__(self, lang_code: str = 'z'):
         """Initialize the CosyVoice2 TTS engine
         Args:
             lang_code (str): Language code for the engine
         """
         super().__init__(lang_code)
         self.model = None
     def _ensure_model(self):
         """Ensure the model is loaded
         Returns:
             bool: True if model is available, False otherwise
         """
         if self.model is None:
             self.model = _get_model()
         return self.model is not None
     def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
         """Generate speech using CosyVoice2 TTS engine
         Args:
             text (str): Input text to synthesize
             voice (str): Voice ID (may not be used in CosyVoice2)
             speed (float): Speech speed multiplier (may not be used in CosyVoice2)
         Returns:
             Optional[str]: Path to the generated audio file or None if generation fails
         """
         logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")
         # Check if CosyVoice2 is available
         if not COSYVOICE2_AVAILABLE:
             logger.error("CosyVoice2 TTS engine is not available")
             return None
         # Ensure model is loaded
         if not self._ensure_model():
             logger.error("Failed to load CosyVoice2 model")
             return None
         try:
             import torch
             # Generate unique output path
             output_path = self._generate_output_path(prefix="cosyvoice2")
-            # Generate audio
-            with torch.inference_mode():
-                # Assuming CosyVoice2 has a similar API to Dia
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    use_torch_compile=False,
-                    verbose=False
-                )
             if output_audio_np is not None:
                 logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                 sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
@@ -132,57 +144,66 @@ class CosyVoice2TTS(TTSBase):
             else:
                 logger.error("CosyVoice2 model returned None for audio output")
                 return None
         except Exception as e:
             logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
             return None
     def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
         """Generate speech stream using CosyVoice2 TTS engine
         Args:
             text (str): Input text to synthesize
             voice (str): Voice ID (may not be used in CosyVoice2)
             speed (float): Speech speed multiplier (may not be used in CosyVoice2)
         Yields:
             tuple: (sample_rate, audio_data) pairs for each segment
         """
         logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")
         # Check if CosyVoice2 is available
         if not COSYVOICE2_AVAILABLE:
             logger.error("CosyVoice2 TTS engine is not available")
             return
         # Ensure model is loaded
         if not self._ensure_model():
             logger.error("Failed to load CosyVoice2 model")
             return
         try:
             import torch
-            # Generate audio
-            with torch.inference_mode():
-                # Assuming CosyVoice2 has a similar API to Dia
-                output_audio_np = self.model.generate(
-                    text,
-                    max_tokens=None,
-                    cfg_scale=3.0,
-                    temperature=1.3,
-                    top_p=0.95,
-                    use_torch_compile=False,
-                    verbose=False
-                )
             if output_audio_np is not None:
                 logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                 yield DEFAULT_SAMPLE_RATE, output_audio_np
             else:
                 logger.error("CosyVoice2 model returned None for audio output")
                 return
         except Exception as e:
             logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
             return

 # Try to import CosyVoice2 dependencies
 try:
     import torch
+    import torchaudio
+    # Import CosyVoice2 from the correct package
+    # Based on https://github.com/FunAudioLLM/CosyVoice
+    from cosyvoice.cli.cosyvoice import CosyVoice
     COSYVOICE2_AVAILABLE = True
     logger.info("CosyVoice2 TTS engine is available")
+except ImportError as e:
+    logger.warning(f"CosyVoice2 TTS engine is not available - ImportError: {str(e)}")
+    COSYVOICE2_AVAILABLE = False
 except ModuleNotFoundError as e:
+    logger.warning(f"CosyVoice2 TTS engine is not available - ModuleNotFoundError: {str(e)}")
     COSYVOICE2_AVAILABLE = False
 def _get_model():
     """Lazy-load the CosyVoice2 model
     Returns:
         CosyVoice2 or None: The CosyVoice2 model or None if not available
     """
     if not COSYVOICE2_AVAILABLE:
         logger.warning("CosyVoice2 TTS engine is not available")
         return None
     try:
         import torch
+        import torchaudio
+        from cosyvoice.cli.cosyvoice import CosyVoice
+        # Initialize the model with correct path
+        model = CosyVoice('pretrained_models/CosyVoice-300M')
         logger.info("CosyVoice2 model successfully loaded")
         return model
     except ImportError as e:
 class CosyVoice2TTS(TTSBase):
     """CosyVoice2 TTS engine implementation
     This engine uses the CosyVoice2 model for TTS generation.
     """
     def __init__(self, lang_code: str = 'z'):
         """Initialize the CosyVoice2 TTS engine
         Args:
             lang_code (str): Language code for the engine
         """
         super().__init__(lang_code)
         self.model = None
     def _ensure_model(self):
         """Ensure the model is loaded
         Returns:
             bool: True if model is available, False otherwise
         """
         if self.model is None:
             self.model = _get_model()
         return self.model is not None
     def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
         """Generate speech using CosyVoice2 TTS engine
         Args:
             text (str): Input text to synthesize
             voice (str): Voice ID (may not be used in CosyVoice2)
             speed (float): Speech speed multiplier (may not be used in CosyVoice2)
         Returns:
             Optional[str]: Path to the generated audio file or None if generation fails
         """
         logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")
         # Check if CosyVoice2 is available
         if not COSYVOICE2_AVAILABLE:
             logger.error("CosyVoice2 TTS engine is not available")
             return None
         # Ensure model is loaded
         if not self._ensure_model():
             logger.error("Failed to load CosyVoice2 model")
             return None
         try:
             import torch
             # Generate unique output path
             output_path = self._generate_output_path(prefix="cosyvoice2")
+            # Generate audio using CosyVoice2
+            try:
+                # Use the inference method from CosyVoice
+                output_audio_tensor = self.model.inference_sft(text, '中文女')
+                # Convert tensor to numpy array
+                if isinstance(output_audio_tensor, torch.Tensor):
+                    output_audio_np = output_audio_tensor.cpu().numpy()
+                else:
+                    output_audio_np = output_audio_tensor
+            except Exception as api_error:
+                # Try alternative API if the first one fails
+                try:
+                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
+                    if isinstance(output_audio_tensor, torch.Tensor):
+                        output_audio_np = output_audio_tensor.cpu().numpy()
+                    else:
+                        output_audio_np = output_audio_tensor
+                except Exception as alt_error:
+                    logger.error(f"CosyVoice2 inference failed: {str(api_error)}")
+                    return None
             if output_audio_np is not None:
                 logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                 sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
             else:
                 logger.error("CosyVoice2 model returned None for audio output")
                 return None
         except Exception as e:
             logger.error(f"Error generating speech with CosyVoice2: {str(e)}", exc_info=True)
             return None
     def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
         """Generate speech stream using CosyVoice2 TTS engine
         Args:
             text (str): Input text to synthesize
             voice (str): Voice ID (may not be used in CosyVoice2)
             speed (float): Speech speed multiplier (may not be used in CosyVoice2)
         Yields:
             tuple: (sample_rate, audio_data) pairs for each segment
         """
         logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")
         # Check if CosyVoice2 is available
         if not COSYVOICE2_AVAILABLE:
             logger.error("CosyVoice2 TTS engine is not available")
             return
         # Ensure model is loaded
         if not self._ensure_model():
             logger.error("Failed to load CosyVoice2 model")
             return
         try:
             import torch
+            # Generate audio using CosyVoice2
+            try:
+                # Use the inference method from CosyVoice
+                output_audio_tensor = self.model.inference_sft(text, '中文女')
+                # Convert tensor to numpy array
+                if isinstance(output_audio_tensor, torch.Tensor):
+                    output_audio_np = output_audio_tensor.cpu().numpy()
+                else:
+                    output_audio_np = output_audio_tensor
+            except Exception as api_error:
+                # Try alternative API if the first one fails
+                try:
+                    output_audio_tensor = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
+                    if isinstance(output_audio_tensor, torch.Tensor):
+                        output_audio_np = output_audio_tensor.cpu().numpy()
+                    else:
+                        output_audio_np = output_audio_tensor
+                except Exception as alt_error:
+                    logger.error(f"CosyVoice2 inference failed: {str(api_error)}")
+                    return
             if output_audio_np is not None:
                 logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                 yield DEFAULT_SAMPLE_RATE, output_audio_np
             else:
                 logger.error("CosyVoice2 model returned None for audio output")
                 return
         except Exception as e:
             logger.error(f"Error generating speech stream with CosyVoice2: {str(e)}", exc_info=True)
             return