import logging
import numpy as np
import soundfile as sf
from typing import Optional, Generator, Tuple

from utils.tts_base import TTSBase

# Configure logging
logger = logging.getLogger(__name__)

# Flag to track CosyVoice2 availability
COSYVOICE2_AVAILABLE = False

# Sample rate used when writing/streaming output audio; it should match the
# output rate of the loaded CosyVoice checkpoint
DEFAULT_SAMPLE_RATE = 24000

# Try to import CosyVoice2 dependencies
try:
    import torch
    import torchaudio
    # Import the CosyVoice model class from the cosyvoice package
    # (https://github.com/FunAudioLLM/CosyVoice)
    from cosyvoice.cli.cosyvoice import CosyVoice
    COSYVOICE2_AVAILABLE = True
    logger.info("CosyVoice2 TTS engine is available")
except ImportError as e:
    # ModuleNotFoundError is a subclass of ImportError, so a single handler
    # covers both missing packages and broken installs
    logger.warning(f"CosyVoice2 TTS engine is not available - {type(e).__name__}: {e}")
    COSYVOICE2_AVAILABLE = False
def _get_model():
    """Lazy-load the CosyVoice model.

    Returns:
        CosyVoice or None: The loaded CosyVoice model, or None if unavailable
    """
    if not COSYVOICE2_AVAILABLE:
        logger.warning("CosyVoice2 TTS engine is not available")
        return None
    try:
        from cosyvoice.cli.cosyvoice import CosyVoice
        # Initialize the model from the expected checkpoint directory
        model = CosyVoice('pretrained_models/CosyVoice-300M')
        logger.info("CosyVoice2 model successfully loaded")
        return model
    except ImportError as e:
        logger.error(f"Failed to import CosyVoice2 dependencies: {e}")
        return None
    except FileNotFoundError as e:
        logger.error(f"Failed to load CosyVoice2 model files: {e}")
        return None
    except Exception as e:
        logger.error(f"Failed to initialize CosyVoice2 model: {e}")
        return None
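# ---------------------------------------------------------------------------
# Note: _get_model() expects the checkpoint at pretrained_models/CosyVoice-300M.
# One way to fetch it (following the upstream CosyVoice README; assumes the
# modelscope package is installed) is:
#
#     from modelscope import snapshot_download
#     snapshot_download('iic/CosyVoice-300M',
#                       local_dir='pretrained_models/CosyVoice-300M')
# ---------------------------------------------------------------------------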
class CosyVoice2TTS(TTSBase):
    """CosyVoice2 TTS engine implementation.

    This engine uses the CosyVoice2 model for TTS generation.
    """

    def __init__(self, lang_code: str = 'z'):
        """Initialize the CosyVoice2 TTS engine.

        Args:
            lang_code (str): Language code for the engine
        """
        super().__init__(lang_code)
        self.model = None

    def _ensure_model(self):
        """Ensure the model is loaded.

        Returns:
            bool: True if the model is available, False otherwise
        """
        if self.model is None:
            self.model = _get_model()
        return self.model is not None

    def generate_speech(self, text: str, voice: str = 'default', speed: float = 1.0) -> Optional[str]:
        """Generate speech using the CosyVoice2 TTS engine.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (not currently used by CosyVoice2)
            speed (float): Speech speed multiplier (not currently used by CosyVoice2)

        Returns:
            Optional[str]: Path to the generated audio file, or None if generation fails
        """
        logger.info(f"Generating speech with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return None

        # Ensure the model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return None

        try:
            # Generate a unique output path
            output_path = self._generate_output_path(prefix="cosyvoice2")

            # Generate audio using CosyVoice2
            try:
                # Use the SFT inference method with a built-in speaker
                output = self.model.inference_sft(text, '中文女')
            except Exception as api_error:
                # Fall back to the zero-shot API if SFT inference fails
                try:
                    output = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
                except Exception as alt_error:
                    logger.error(
                        f"CosyVoice2 inference failed: {api_error}; "
                        f"zero-shot fallback also failed: {alt_error}"
                    )
                    return None

            # Normalise the model output to a 1-D numpy array. Depending on the
            # CosyVoice version, the result may be a dict containing a
            # 'tts_speech' tensor, a raw tensor, or a numpy array.
            if isinstance(output, dict) and 'tts_speech' in output:
                output = output['tts_speech']
            if isinstance(output, torch.Tensor):
                output_audio_np = output.squeeze().cpu().numpy()
            elif output is not None:
                output_audio_np = np.asarray(output).squeeze()
            else:
                output_audio_np = None

            if output_audio_np is not None:
                logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                sf.write(output_path, output_audio_np, DEFAULT_SAMPLE_RATE)
                logger.info(f"CosyVoice2 audio generation complete: {output_path}")
                return output_path
            else:
                logger.error("CosyVoice2 model returned no audio output")
                return None
        except Exception as e:
            logger.error(f"Error generating speech with CosyVoice2: {e}", exc_info=True)
            return None
    def generate_speech_stream(self, text: str, voice: str = 'default', speed: float = 1.0) -> Generator[Tuple[int, np.ndarray], None, None]:
        """Generate a speech stream using the CosyVoice2 TTS engine.

        Args:
            text (str): Input text to synthesize
            voice (str): Voice ID (not currently used by CosyVoice2)
            speed (float): Speech speed multiplier (not currently used by CosyVoice2)

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment
        """
        logger.info(f"Generating speech stream with CosyVoice2 for text length: {len(text)}")

        # Check if CosyVoice2 is available
        if not COSYVOICE2_AVAILABLE:
            logger.error("CosyVoice2 TTS engine is not available")
            return

        # Ensure the model is loaded
        if not self._ensure_model():
            logger.error("Failed to load CosyVoice2 model")
            return

        try:
            # Generate audio using CosyVoice2
            try:
                # Use the SFT inference method with a built-in speaker
                output = self.model.inference_sft(text, '中文女')
            except Exception as api_error:
                # Fall back to the zero-shot API if SFT inference fails
                try:
                    output = self.model.inference_zero_shot(text, '请输入提示文本', '中文女')
                except Exception as alt_error:
                    logger.error(
                        f"CosyVoice2 inference failed: {api_error}; "
                        f"zero-shot fallback also failed: {alt_error}"
                    )
                    return

            # Normalise the model output to a 1-D numpy array (see generate_speech)
            if isinstance(output, dict) and 'tts_speech' in output:
                output = output['tts_speech']
            if isinstance(output, torch.Tensor):
                output_audio_np = output.squeeze().cpu().numpy()
            elif output is not None:
                output_audio_np = np.asarray(output).squeeze()
            else:
                output_audio_np = None

            if output_audio_np is not None:
                logger.info(f"Successfully generated audio with CosyVoice2 (length: {len(output_audio_np)})")
                yield DEFAULT_SAMPLE_RATE, output_audio_np
            else:
                logger.error("CosyVoice2 model returned no audio output")
                return
        except Exception as e:
            logger.error(f"Error generating speech stream with CosyVoice2: {e}", exc_info=True)
            return
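

# ---------------------------------------------------------------------------
# Example usage (illustrative sketch): assumes the CosyVoice checkpoint is
# available at pretrained_models/CosyVoice-300M and that utils.tts_base.TTSBase
# provides the _generate_output_path() helper used above. If CosyVoice is not
# installed, both calls simply log an error and produce no audio.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    tts = CosyVoice2TTS(lang_code='z')

    # One-shot synthesis: returns a path to the generated audio file, or None
    wav_path = tts.generate_speech('你好，欢迎使用语音合成。')
    print(f'Generated file: {wav_path}')

    # Streaming synthesis: yields (sample_rate, numpy array) segments
    for sample_rate, audio in tts.generate_speech_stream('你好，欢迎使用语音合成。'):
        print(f'Received segment: {audio.shape} samples at {sample_rate} Hz')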