Spaces:

DroolingPanda
/

teachingAssistant

Sleeping

App Files Files Community

Michael Hu committed on Jan 25

Commit

cd1309d

1 Parent(s): 71acd53

initial check in

Browse files

Files changed (6) hide show

LICENSE +21 -0
app.py +118 -0
pyproject.toml +60 -0
utils/stt.py +51 -0
utils/translation.py +45 -0
utils/tts.py +46 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Michael
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

app.py ADDED Viewed

	@@ -0,0 +1,118 @@

+"""
+Main entry point for the Audio Translation Web Application
+Handles file upload, processing pipeline, and UI rendering
+"""
+import streamlit as st
+import os
+import time
+from dotenv import load_dotenv
+from utils.stt import transcribe_audio
+from utils.translation import translate_text
+from utils.tts import generate_speech
+# Initialize environment configurations
+load_dotenv()
+os.makedirs("temp/uploads", exist_ok=True)
+os.makedirs("temp/outputs", exist_ok=True)
+def configure_page():
+    """Set up Streamlit page configuration"""
+    st.set_page_config(
+        page_title="Audio Translator",
+        page_icon="🎧",
+        layout="wide",
+        initial_sidebar_state="expanded"
+    )
+    st.markdown("""
+        <style>
+            .reportview-container {margin-top: -2em;}
+            #MainMenu {visibility: hidden;}
+            .stDeployButton {display:none;}
+        </style>
+    """, unsafe_allow_html=True)
+def handle_file_processing(upload_path):
+    """
+    Execute the complete processing pipeline:
+    1. Speech-to-Text (STT)
+    2. Machine Translation
+    3. Text-to-Speech (TTS)
+    """
+    progress_bar = st.progress(0)
+    status_text = st.empty()
+    try:
+        # STT Phase
+        status_text.markdown("🔍 **Performing Speech Recognition...**")
+        english_text = transcribe_audio(upload_path)
+        progress_bar.progress(30)
+        # Translation Phase
+        status_text.markdown("🌐 **Translating Content...**")
+        chinese_text = translate_text(english_text)
+        progress_bar.progress(60)
+        # TTS Phase
+        status_text.markdown("🎵 **Generating Chinese Speech...**")
+        output_path = generate_speech(chinese_text)
+        progress_bar.progress(100)
+        # Display results
+        status_text.success("✅ Processing Complete!")
+        return english_text, chinese_text, output_path
+    except Exception as e:
+        status_text.error(f"❌ Processing Failed: {str(e)}")
+        st.exception(e)
+        raise
+def render_results(english_text, chinese_text, output_path):
+    """Display processing results in organized columns"""
+    st.divider()
+    col1, col2 = st.columns([2, 1])
+    with col1:
+        st.subheader("Recognition Results")
+        st.code(english_text, language="text")
+        st.subheader("Translation Results")
+        st.code(chinese_text, language="text")
+    with col2:
+        st.subheader("Audio Output")
+        st.audio(output_path)
+        with open(output_path, "rb") as f:
+            st.download_button(
+                label="Download Audio",
+                data=f,
+                file_name="translated_audio.wav",
+                mime="audio/wav"
+            )
+def main():
+    """Main application workflow"""
+    configure_page()
+    st.title("🎧 High-Quality Audio Translation System")
+    st.markdown("Upload English Audio → Get Chinese Speech Output")
+    # File uploader widget
+    uploaded_file = st.file_uploader(
+        "Select Audio File (MP3/WAV)",
+        type=["mp3", "wav"],
+        accept_multiple_files=False
+    )
+    if uploaded_file:
+        # Save uploaded file
+        upload_path = os.path.join("temp/uploads", uploaded_file.name)
+        with open(upload_path, "wb") as f:
+            f.write(uploaded_file.getbuffer())
+        # Execute processing pipeline
+        results = handle_file_processing(upload_path)
+        if results:
+            render_results(*results)
+if __name__ == "__main__":
+    main()

pyproject.toml ADDED Viewed

	@@ -0,0 +1,60 @@

+[tool.poetry]
+name = "teaching-assistant"
+version = "0.1.0"
+description = "High-quality audio translation web application"
+authors = ["Your Name <[email protected]>"]
+license = "MIT"
+keywords = ["nlp", "translation", "speech-processing"]
+[tool.poetry.dependencies]
+python = "^3.9"
+# Core dependencies
+streamlit = "^1.31.1"
+pydub = "^0.25.1"
+python-dotenv = "^1.0.0"
+nltk = "^3.8.1"          # Text segmentation
+librosa = "^0.10.1"      # Advanced audio processing
+soundfile = "^0.12.1"    # Audio file I/O
+ffmpeg-python = "^0.2.0" # FFmpeg integration
+# Machine learning frameworks
+torch = { version = "^2.2.1", source = "pytorch" }
+transformers = { version = "^4.38.2", extras = ["audio"] }
+# Text-to-speech engine
+TTS = "^0.21.0"
+# Platform-specific dependencies
+torchaudio = { version = "^2.2.1", source = "pytorch", optional = true }
+[tool.poetry.group.dev.dependencies]
+black = "^24.3.0"
+flake8 = "^6.1.0"
+mypy = "^1.8.0"
+pytest = "^8.0.2"
+[[tool.poetry.source]]
+name = "pytorch"
+url = "https://download.pytorch.org/whl/cpu"
+# "explicit": only dependencies that name this source (torch, torchaudio)
+# are resolved from it; everything else keeps coming from PyPI.
+priority = "explicit"
+[build-system]
+requires = ["poetry-core>=1.3.2"]
+build-backend = "poetry.core.masonry.api"
+[tool.poetry.extras]
+gpu = ["torchaudio"]
+[tool.poetry.scripts]
+start = "app:main"
+# Project URLs belong to Poetry's own table; a bare [project.urls] has no
+# [project] table to attach to in this file.
+[tool.poetry.urls]
+Documentation = "https://github.com/yourusername/audio-translator/wiki"
+Issue-Tracker = "https://github.com/yourusername/audio-translator/issues"
+# Configuration notes:
+# 1. Torch dependencies are sourced from PyTorch's official repository
+#    (the URL above points at the CPU wheel index)
+# 2. Transformers include audio processing extras
+# 3. The optional torchaudio package is installed via: poetry install --extras "gpu"
+# 4. Optional dependencies are exposed through extras ([tool.poetry.extras])

utils/stt.py ADDED Viewed

	@@ -0,0 +1,51 @@

+"""
+Speech Recognition Module using Whisper Large-v3
+Handles audio preprocessing and transcription
+"""
+import torch
+from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor
+from pydub import AudioSegment
+def transcribe_audio(audio_path):
+    """
+    Convert audio file to text using Whisper ASR model
+    Args:
+        audio_path: Path to input audio file
+    Returns:
+        Transcribed English text
+    """
+    # Configure hardware settings
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    # Convert to proper audio format
+    audio = AudioSegment.from_file(audio_path)
+    processed_audio = audio.set_frame_rate(16000).set_channels(1)
+    wav_path = audio_path.replace(".mp3", ".wav")
+    processed_audio.export(wav_path, format="wav")
+    # Initialize ASR model
+    model = AutoModelForSpeechSeq2Seq.from_pretrained(
+        "openai/whisper-large-v3",
+        torch_dtype=torch.float32,
+        low_cpu_mem_usage=True,
+        use_safetensors=True
+    ).to(device)
+    processor = AutoProcessor.from_pretrained("openai/whisper-large-v3")
+    # Process audio input
+    inputs = processor(
+        wav_path,
+        sampling_rate=16000,
+        return_tensors="pt",
+        truncation=True,
+        chunk_length_s=30,
+        stride_length_s=5
+    ).to(device)
+    # Generate transcription
+    with torch.no_grad():
+        outputs = model.generate(**inputs, language="en", task="transcribe")
+    return processor.batch_decode(outputs, skip_special_tokens=True)[0]

utils/translation.py ADDED Viewed

	@@ -0,0 +1,45 @@

+"""
+Text Translation Module using NLLB-3.3B model
+Handles text segmentation and batch translation
+"""
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+def translate_text(text):
+    """
+    Translate English text to Simplified Chinese
+    Args:
+        text: Input English text
+    Returns:
+        Translated Chinese text
+    """
+    # Initialize translation model
+    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-3.3B")
+    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-3.3B")
+    # Split long text into manageable chunks
+    max_chunk_length = 1000
+    text_chunks = [
+        text[i:i+max_chunk_length]
+        for i in range(0, len(text), max_chunk_length)
+    ]
+    translated_chunks = []
+    for chunk in text_chunks:
+        # Prepare model inputs
+        inputs = tokenizer(
+            chunk,
+            return_tensors="pt",
+            max_length=1024,
+            truncation=True
+        )
+        # Generate translation
+        outputs = model.generate(
+            **inputs,
+            forced_bos_token_id=tokenizer.lang_code_to_id["zho_Hans"],
+            max_new_tokens=1024
+        )
+        translated_chunks.append(tokenizer.decode(outputs[0], skip_special_tokens=True))
+    return "".join(translated_chunks)

utils/tts.py ADDED Viewed

	@@ -0,0 +1,46 @@

+"""
+Text-to-Speech Module using YourTTS
+Handles speech synthesis and output generation
+"""
+from TTS.api import TTS
+import os
+import time
+def generate_speech(text):
+    """
+    Convert Chinese text to natural-sounding speech
+    Args:
+        text: Input Chinese text
+    Returns:
+        Path to generated audio file
+    """
+    # Initialize TTS engine
+    tts = TTS(
+        model_name="tts_models/multilingual/multi-dataset/your_tts",
+        progress_bar=False,
+        gpu=False
+    )
+    # Create unique output filename
+    output_path = os.path.join(
+        "temp/outputs",
+        f"output_{int(time.time())}.wav"
+    )
+    # Use reference voice if available
+    ref_voice = (
+        "assets/reference_voice.wav"
+        if os.path.exists("assets/reference_voice.wav")
+        else None
+    )
+    # Generate speech output
+    tts.tts_to_file(
+        text=text,
+        speaker_wav=ref_voice,
+        language="zh-cn",
+        file_path=output_path
+    )
+    return output_path