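# NOTE: the lines below are file-viewer metadata captured together with the source; kept as comments.
# Michael Hu
# fix: remove hard-coded ASR model list and make ASR model optional
# 3ad3808 | raw | history blame | 11.8 kB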
"""
Main entry point for the Audio Translation Web Application
Handles file upload, processing pipeline, and UI rendering using DDD architecture with Gradio
"""
import logging
# Configure root logging once at import time: INFO and above go to both
# app.log (in the current working directory) and the console.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        # Explicit UTF-8: log lines include user-supplied file names, which the
        # platform's default encoding may not be able to represent.
        logging.FileHandler("app.log", encoding="utf-8"),
        logging.StreamHandler(),
    ],
)
# Module-level logger used by every function in this file
logger = logging.getLogger(__name__)
import gradio as gr
import os
import json
from typing import Optional, Tuple, Dict, Any
# Import application services and DTOs
from src.application.services.audio_processing_service import AudioProcessingApplicationService
from src.application.services.configuration_service import ConfigurationApplicationService
from src.application.dtos.audio_upload_dto import AudioUploadDto
from src.application.dtos.processing_request_dto import ProcessingRequestDto
from src.application.dtos.processing_result_dto import ProcessingResultDto
# Import infrastructure setup
from src.infrastructure.config.container_setup import initialize_global_container, get_global_container
# Make sure the working directories for uploads and generated outputs exist
# before anything tries to use them (no error when they are already present).
os.makedirs(name="temp/uploads", exist_ok=True)
os.makedirs(name="temp/outputs", exist_ok=True)
# Tracks whether the global dependency injection container has already been
# initialized in this process, so initialization runs at most once.
container_initialized = False


def initialize_application():
    """
    Initialize the application with dependency injection container.

    Does nothing when the container was already initialized by an earlier call.

    Raises:
        RuntimeError: If container initialization fails. The original exception
            is attached as the cause so its traceback is not lost.
    """
    global container_initialized
    if container_initialized:
        return

    try:
        logger.info("Initializing application container")
        initialize_global_container()
        # Only mark as initialized once the container setup returned normally
        container_initialized = True
        logger.info("Application container initialized successfully")
    except Exception as e:
        logger.error(f"Failed to initialize application: {e}", exc_info=True)
        raise RuntimeError(f"Application initialization failed: {str(e)}") from e
def create_audio_upload_dto(audio_file_path: str) -> AudioUploadDto:
    """
    Create AudioUploadDto from audio file path.

    The whole file is read into memory and its content type is derived from
    the file extension.

    Args:
        audio_file_path: Path to the uploaded audio file

    Returns:
        AudioUploadDto: DTO containing upload information

    Raises:
        ValueError: If the path is empty, is not an existing regular file, or
            the file cannot be read or turned into a DTO. The original
            exception is attached as the cause.
    """
    try:
        # isfile (rather than exists) also rejects directories up front with a
        # clear message instead of failing later inside open()
        if not audio_file_path or not os.path.isfile(audio_file_path):
            raise ValueError("No audio file provided or file does not exist")

        filename = os.path.basename(audio_file_path)
        logger.info(f"Creating AudioUploadDto for file: {filename}")
        logger.info(f"Full file path: {audio_file_path}")

        with open(audio_file_path, 'rb') as f:
            content = f.read()

        # Determine content type based on file extension (case-insensitive)
        file_ext = os.path.splitext(filename.lower())[1]
        logger.info(f"Detected file extension: {file_ext}")
        content_type_map = {
            '.wav': 'audio/wav',
            '.mp3': 'audio/mpeg',
            '.m4a': 'audio/mp4',
            '.flac': 'audio/flac',
            '.ogg': 'audio/ogg'
        }
        # Unknown or missing extensions are treated as WAV
        content_type = content_type_map.get(file_ext, 'audio/wav')
        logger.info(f"Mapped content type: {content_type}")

        # Log file size info
        file_size = len(content)
        logger.info(f"File size: {file_size} bytes ({file_size / 1024 / 1024:.2f} MB)")

        return AudioUploadDto(
            filename=filename,
            content=content,
            content_type=content_type,
            size=file_size
        )
    except Exception as e:
        # Every failure is reported to callers as a ValueError; chaining keeps
        # the underlying error available for debugging.
        logger.error(f"Failed to create AudioUploadDto: {e}")
        raise ValueError(f"Invalid audio file: {str(e)}") from e
def get_supported_configurations() -> dict:
    """
    Fetch the supported configuration options from the application service.

    If the container or the service cannot supply them, the failure is logged
    and a built-in fallback configuration is returned instead of raising.

    Returns:
        dict: Supported configurations
    """
    try:
        logger.info("Getting global container...")
        di_container = get_global_container()
        logger.info("Resolving AudioProcessingApplicationService...")
        service = di_container.resolve(AudioProcessingApplicationService)
        logger.info("Getting supported configurations from service...")
        supported = service.get_supported_configurations()
        logger.info(f"Retrieved configurations: {supported}")
    except Exception as e:
        logger.error(f"Failed to get configurations: {e}", exc_info=True)
        logger.warning("Using fallback configurations - this may indicate a configuration service issue")
        # Minimal defaults that keep the UI usable without the service
        defaults = dict(
            voices=['chatterbox'],
            languages=['en', 'zh'],
            audio_formats=['wav', 'mp3', 'm4a', 'flac', 'ogg'],
            max_file_size_mb=100,
            speed_range=dict(min=0.5, max=2.0),
        )
        logger.info(f"Using fallback configuration: {defaults}")
        return defaults
    else:
        return supported
def process_audio_pipeline(
    audio_file,
    target_language: str,
    voice: str,
    speed: float,
    source_language: str = "en",
    asr_model: Optional[str] = None
) -> Tuple[str, str, str, Optional[str], str]:
    """
    Execute the complete processing pipeline using application services.

    Does not raise: every failure is logged and reported through the returned
    status message and processing details.

    Args:
        audio_file: Gradio audio file input (a file path; falsy when nothing was uploaded)
        target_language: Target language for translation
        voice: Voice for TTS
        speed: Speech speed
        source_language: Source language
        asr_model: ASR model to use. Optional; None is forwarded unchanged in
            the processing request so the default from config is used.

    Returns:
        Tuple: (status_message, original_text, translated_text, audio_output_path, processing_details)
        audio_output_path is None when there is no audio output or processing failed.
    """
    try:
        if not audio_file:
            return "❌ No audio file provided", "", "", None, ""

        logger.info(f"Starting processing for: {audio_file} using {asr_model or 'default'} model")
        logger.info(f"Audio file exists: {os.path.exists(audio_file)}")

        # Create audio upload DTO
        logger.info("Creating AudioUploadDto...")
        audio_upload = create_audio_upload_dto(audio_file)
        logger.info(f"AudioUploadDto created successfully - Content-Type: {audio_upload.content_type}")

        # Get application service from container
        container = get_global_container()
        audio_service = container.resolve(AudioProcessingApplicationService)

        # Create processing request
        request = ProcessingRequestDto(
            audio=audio_upload,
            asr_model=asr_model,  # This will use the default from config if None
            target_language=target_language,
            voice=voice,
            speed=speed,
            source_language=source_language
        )

        # Process through application service
        result = audio_service.process_audio_pipeline(request)

        if not result.success:
            error_msg = f"❌ Processing Failed: {result.error_message}"
            logger.error(f"Processing failed: {result.error_message}")
            return error_msg, "", "", None, f"Error: {result.error_message}"

        status_message = f"✅ Processing Complete! ({result.processing_time:.2f}s)"
        logger.info(f"Processing completed successfully in {result.processing_time:.2f}s")

        # Prepare processing details; service metadata may override these keys
        details = {
            "processing_time": f"{result.processing_time:.2f}s",
            "asr_model": asr_model,
            "target_language": target_language,
            "voice": voice,
            "speed": speed
        }
        if result.metadata:
            details.update(result.metadata)
        # default=str: the details are display-only, so a non-JSON value in the
        # metadata must not turn a successful run into a reported failure
        processing_details = json.dumps(details, indent=2, default=str)

        return (
            status_message,
            result.original_text or "",
            result.translated_text or "",
            result.audio_path if result.has_audio_output else None,
            processing_details
        )
    except Exception as e:
        # UI boundary: report the failure in the outputs rather than raising
        logger.error(f"Processing failed: {str(e)}", exc_info=True)
        error_msg = f"❌ Processing Failed: {str(e)}"
        return error_msg, "", "", None, f"System Error: {str(e)}"
def create_interface():
    """
    Create and configure the Gradio interface using gr.Interface for better compatibility.

    Initializes the application container, reads the supported configuration
    and builds a form (audio upload, target language, voice, speed) whose
    submissions are handled by process_audio_pipeline.

    Returns:
        gr.Interface: The configured interface (not launched here)
    """
    # Initialize application
    initialize_application()

    # Get supported configurations
    config = get_supported_configurations()

    # Log configuration details for debugging
    logger.info("=== Gradio Interface Configuration ===")
    logger.info(f"Supported voices: {config.get('voices', [])}")
    logger.info(f"Supported audio formats: {config.get('audio_formats', [])}")
    logger.info(f"Max file size: {config.get('max_file_size_mb', 0)} MB")
    logger.info(f"Speed range: {config.get('speed_range', {})}")
    logger.info("=== End Configuration ===")

    # Read the UI options defensively: a configuration that lacks a key must
    # not prevent the interface from being built. The defaults match the
    # fallback configuration used when the configuration service is unavailable.
    voices = list(config.get('voices') or ['chatterbox'])
    # Preselect "chatterbox" when offered, otherwise the first available voice,
    # so the dropdown's initial value is always one of its choices
    default_voice = 'chatterbox' if 'chatterbox' in voices else voices[0]
    speed_range = config.get('speed_range') or {}
    min_speed = speed_range.get('min', 0.5)
    max_speed = speed_range.get('max', 2.0)
    # Normal speed, clamped into the configured range
    default_speed = min(max(1.0, min_speed), max_speed)

    # Language options mapping (display name -> language code)
    language_options = {
        "Chinese (Mandarin)": "zh",
        "English": "en"
    }

    def process_wrapper(audio_file, target_lang_val, voice_val, speed_val):
        """Wrapper function for processing: adapts the form values to the pipeline call"""
        # Map display language to code
        target_lang_code = language_options.get(target_lang_val, "zh")

        # No ASR model is passed here; model selection is left to the pipeline
        return process_audio_pipeline(
            audio_file=audio_file,
            target_language=target_lang_code,
            voice=voice_val,
            speed=speed_val,
            source_language="en"
        )

    # Create the interface using gr.Interface for better compatibility
    logger.info("Creating Gradio interface with updated file type support...")
    logger.info("Updated file types for Audio component: .wav, .mp3, .m4a, .flac, .ogg")

    interface = gr.Interface(
        fn=process_wrapper,
        inputs=[
            # type="filepath": the handler receives the uploaded file's path
            gr.Audio(
                label="Upload Audio File",
                type="filepath",
            ),
            gr.Dropdown(
                choices=list(language_options.keys()),
                value="Chinese (Mandarin)",
                label="Target Language"
            ),
            gr.Dropdown(
                choices=voices,
                value=default_voice,
                label="Voice"
            ),
            gr.Slider(
                minimum=min_speed,
                maximum=max_speed,
                value=default_speed,
                step=0.1,
                label="Speech Speed"
            )
        ],
        # Order matches the tuple returned by process_audio_pipeline
        outputs=[
            gr.Textbox(label="Status"),
            gr.Textbox(label="Recognition Results"),
            gr.Textbox(label="Translation Results"),
            gr.Audio(label="Audio Output"),
            gr.Code(label="Processing Details", language="json")
        ],
        title="🎧 High-Quality Audio Translation System",
        description="Upload English Audio → Get Chinese Speech Output",
        examples=[
            # Add example configurations if needed
        ]
    )

    return interface
def main():
    """
    Main application entry point.

    Builds the Gradio interface and serves it on port 7860 on all network
    interfaces. Any startup failure is logged with its traceback and re-raised.
    """
    logger.info("Starting Gradio application")

    # Server settings passed to Gradio's launch()
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        debug=False,
        show_error=True,
        quiet=False,
    )

    try:
        # Build the interface, then start serving it
        create_interface().launch(**launch_options)
    except Exception as e:
        logger.error(f"Failed to start application: {str(e)}", exc_info=True)
        raise


# Run the app only when this file is executed as a script
if __name__ == "__main__":
    main()