ACloudCenter committed on
Commit
0207836
·
1 Parent(s): 0a19e95

Fix Missing UI comps

Browse files
Files changed (2) hide show
  1. app.py +217 -1
  2. backend_modal/modal_runner.py +54 -1
app.py CHANGED
@@ -19,6 +19,75 @@ AVAILABLE_VOICES = [
19
  ]
20
  DEFAULT_SPEAKERS = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # --- Modal Connection ---
23
  try:
24
  # Look up the remote class
@@ -54,7 +123,7 @@ def create_demo_interface():
54
  alt="VibeVoice Banner">
55
  </div>
56
  """)
57
- gr.Markdown("## GPU processing is now offloaded to a Modal.com backend!")
58
 
59
  with gr.Tabs():
60
  with gr.Tab("Generate"):
@@ -104,6 +173,45 @@ def create_demo_interface():
104
  lines=12,
105
  max_lines=20,
106
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  generate_btn = gr.Button(
108
  "🚀 Generate Conference (on Modal)", size="lg",
109
  variant="primary",
@@ -116,6 +224,55 @@ def create_demo_interface():
116
 
117
  def update_speaker_visibility(num_speakers):
118
  return [gr.update(visible=(i < num_speakers)) for i in range(4)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  num_speakers.change(
121
  fn=update_speaker_visibility,
@@ -156,6 +313,65 @@ def create_demo_interface():
156
  inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
157
  outputs=[complete_audio_output, log_output]
158
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  return interface
160
 
161
  # --- Main Execution ---
 
19
  ]
20
  DEFAULT_SPEAKERS = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
21
 
22
+ # Male and female voice categories for smart speaker selection
23
+ MALE_VOICES = [
24
+ "en-Carter_man",
25
+ "en-Frank_man",
26
+ "en-Yasser_man",
27
+ "in-Samuel_man",
28
+ "zh-Anchen_man_bgm",
29
+ "zh-Bowen_man"
30
+ ]
31
+ FEMALE_VOICES = [
32
+ "en-Alice_woman_bgm",
33
+ "en-Alice_woman",
34
+ "en-Maya_woman",
35
+ "zh-Xinran_woman"
36
+ ]
37
+
38
+ # Load example scripts
39
+ def load_example_scripts():
40
+ examples_dir = "text_examples"
41
+ example_scripts = []
42
+ example_scripts_natural = []
43
+
44
+ if not os.path.exists(examples_dir):
45
+ return example_scripts, example_scripts_natural
46
+
47
+ original_files = [
48
+ "1p_ai_tedtalk.txt",
49
+ "1p_politcal_speech.txt",
50
+ "2p_financeipo_meeting.txt",
51
+ "2p_telehealth_meeting.txt",
52
+ "3p_military_meeting.txt",
53
+ "3p_oil_meeting.txt",
54
+ "4p_gamecreation_meeting.txt",
55
+ "4p_product_meeting.txt"
56
+ ]
57
+
58
+ for txt_file in original_files:
59
+ file_path = os.path.join(examples_dir, txt_file)
60
+ natural_file = txt_file.replace(".txt", "_natural.txt")
61
+ natural_path = os.path.join(examples_dir, natural_file)
62
+
63
+ if os.path.exists(file_path):
64
+ with open(file_path, 'r', encoding='utf-8') as f:
65
+ example_scripts.append(f.read())
66
+ else:
67
+ example_scripts.append("")
68
+
69
+ if os.path.exists(natural_path):
70
+ with open(natural_path, 'r', encoding='utf-8') as f:
71
+ example_scripts_natural.append(f.read())
72
+ else:
73
+ example_scripts_natural.append(example_scripts[-1] if example_scripts else "")
74
+
75
+ return example_scripts, example_scripts_natural
76
+
77
+ # Gender mapping for each script's speakers
78
+ SCRIPT_SPEAKER_GENDERS = [
79
+ ["female"], # AI TED Talk - Rachel
80
+ ["neutral"], # Political Speech - generic speaker
81
+ ["male", "female"], # Finance IPO - James, Patricia
82
+ ["female", "male"], # Telehealth - Jennifer, Tom
83
+ ["female", "male", "female"], # Military - Sarah, David, Lisa
84
+ ["male", "female", "male"], # Oil - Robert, Lisa, Michael
85
+ ["male", "female", "male", "male"], # Game Creation - Alex, Sarah, Marcus, Emma
86
+ ["female", "male", "female", "male"] # Product Meeting - Sarah, Marcus, Jennifer, David
87
+ ]
88
+
89
+ EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()
90
+
91
  # --- Modal Connection ---
92
  try:
93
  # Look up the remote class
 
123
  alt="VibeVoice Banner">
124
  </div>
125
  """)
126
+ gr.Markdown("## NOTE: The Large model takes significant generation time with limited increase in quality. I recommend trying 1.5B first.")
127
 
128
  with gr.Tabs():
129
  with gr.Tab("Generate"):
 
173
  lines=12,
174
  max_lines=20,
175
  )
176
+
177
+ with gr.Row():
178
+ with gr.Column(scale=1):
179
+ gr.Markdown("### Example Scripts")
180
+ with gr.Row():
181
+ use_natural = gr.Checkbox(
182
+ value=True,
183
+ label="Natural talking sounds",
184
+ scale=1
185
+ )
186
+ duration_display = gr.Textbox(
187
+ value="",
188
+ label="Est. Duration",
189
+ interactive=False,
190
+ scale=1
191
+ )
192
+
193
+ example_names = [
194
+ "AI TED Talk",
195
+ "Political Speech",
196
+ "Finance IPO Meeting",
197
+ "Telehealth Meeting",
198
+ "Military Meeting",
199
+ "Oil Meeting",
200
+ "Game Creation Meeting",
201
+ "Product Meeting"
202
+ ]
203
+
204
+ example_buttons = []
205
+ with gr.Row():
206
+ for i in range(min(4, len(example_names))):
207
+ btn = gr.Button(example_names[i], size="sm", variant="secondary")
208
+ example_buttons.append(btn)
209
+
210
+ with gr.Row():
211
+ for i in range(4, min(8, len(example_names))):
212
+ btn = gr.Button(example_names[i], size="sm", variant="secondary")
213
+ example_buttons.append(btn)
214
+
215
  generate_btn = gr.Button(
216
  "🚀 Generate Conference (on Modal)", size="lg",
217
  variant="primary",
 
224
 
225
  def update_speaker_visibility(num_speakers):
226
  return [gr.update(visible=(i < num_speakers)) for i in range(4)]
227
+
228
+ def smart_speaker_selection(gender_list):
229
+ """Select speakers based on gender requirements."""
230
+ selected = []
231
+ for gender in gender_list:
232
+ if gender == "male" and MALE_VOICES:
233
+ available = [v for v in MALE_VOICES if v not in selected]
234
+ if available:
235
+ selected.append(available[0])
236
+ else:
237
+ selected.append(MALE_VOICES[0])
238
+ elif gender == "female" and FEMALE_VOICES:
239
+ available = [v for v in FEMALE_VOICES if v not in selected]
240
+ if available:
241
+ selected.append(available[0])
242
+ else:
243
+ selected.append(FEMALE_VOICES[0])
244
+ else:
245
+ # neutral or fallback
246
+ available = [v for v in AVAILABLE_VOICES if v not in selected]
247
+ if available:
248
+ selected.append(available[0])
249
+ else:
250
+ selected.append(AVAILABLE_VOICES[0])
251
+ return selected
252
+
253
+ def load_specific_example(idx, natural):
254
+ """Load a specific example script."""
255
+ if idx >= len(EXAMPLE_SCRIPTS):
256
+ return [2, ""] + [None, None, None, None]
257
+
258
+ script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
259
+ genders = SCRIPT_SPEAKER_GENDERS[idx] if idx < len(SCRIPT_SPEAKER_GENDERS) else ["neutral"]
260
+ speakers = smart_speaker_selection(genders)
261
+
262
+ # Pad speakers to 4
263
+ while len(speakers) < 4:
264
+ speakers.append(None)
265
+
266
+ return [len(genders), script] + speakers[:4]
267
+
268
+ # Connect example buttons
269
+ for idx, btn in enumerate(example_buttons):
270
+ btn.click(
271
+ fn=lambda nat, i=idx: load_specific_example(i, nat),
272
+ inputs=[use_natural],
273
+ outputs=[num_speakers, script_input] + speaker_selections,
274
+ queue=False
275
+ )
276
 
277
  num_speakers.change(
278
  fn=update_speaker_visibility,
 
313
  inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
314
  outputs=[complete_audio_output, log_output]
315
  )
316
+
317
+ with gr.Tab("Architecture"):
318
+ with gr.Row():
319
+ gr.Markdown("""VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
320
+ such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly
321
+ in scalability, speaker consistency, and natural turn-taking. A core innovation of VibeVoice is its use of continuous
322
+ speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently
323
+ preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice
324
+ employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and
325
+ dialogue flow, and a diffusion head to generate high-fidelity acoustic details. The model can synthesize speech up to
326
+ 90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.""")
327
+ with gr.Row():
328
+ with gr.Column():
329
+ gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")
330
+
331
+ gr.Markdown("""
332
+ ### Overview
333
+
334
+ VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
335
+ such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems,
336
+ particularly in scalability, speaker consistency, and natural turn-taking.
337
+
338
+ ### Key Features
339
+
340
+ - **Multi-Speaker Support**: Handles up to 4 distinct speakers
341
+ - **Long-Form Generation**: Synthesizes speech up to 90 minutes
342
+ - **Natural Conversation Flow**: Includes turn-taking and interruptions
343
+ - **Ultra-Low Frame Rate**: 7.5 Hz tokenizers for efficiency
344
+ - **High Fidelity**: Preserves acoustic details while being computationally efficient
345
+
346
+ ### Technical Architecture
347
+
348
+ 1. **Continuous Speech Tokenizers**: Acoustic and Semantic tokenizers at 7.5 Hz
349
+ 2. **Next-Token Diffusion Framework**: Combines LLM understanding with diffusion generation
350
+ 3. **Large Language Model**: Understands context and dialogue flow
351
+ 4. **Diffusion Head**: Generates high-fidelity acoustic details
352
+ """)
353
+
354
+ with gr.Column():
355
+ gr.HTML("""
356
+ <div style="width: 100%; padding: 20px;">
357
+ <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/diagram.jpg"
358
+ style="width: 100%; height: auto; border-radius: 10px; box-shadow: 0 5px 20px rgba(0,0,0,0.15);"
359
+ alt="VibeVoice Architecture Diagram">
360
+ </div>
361
+ """)
362
+
363
+ gr.Markdown("""
364
+ ### Model Variants
365
+
366
+ **VibeVoice-1.5B**: Faster inference, suitable for real-time applications
367
+ **VibeVoice-7B**: Higher quality output, recommended for production use
368
+
369
+ ### Performance Metrics
370
+
371
+ <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/chart.png"
372
+ style="width: 100%; height: auto; border-radius: 10px; margin-top: 20px;"
373
+ alt="Performance Comparison">
374
+ """)
375
  return interface
376
 
377
  # --- Main Execution ---
backend_modal/modal_runner.py CHANGED
@@ -29,7 +29,7 @@ image = (
29
  .add_local_dir("backend_modal/modular", remote_path="/root/modular")
30
  .add_local_dir("backend_modal/processor", remote_path="/root/processor")
31
  .add_local_dir("backend_modal/voices", remote_path="/root/voices")
32
- .add_local_dir("./text_examples", remote_path="/root/text_examples")
33
  .add_local_dir("backend_modal/schedule", remote_path="/root/schedule")
34
  )
35
 
@@ -117,6 +117,59 @@ class VibeVoiceModel:
117
  print(f"Error reading audio {audio_path}: {e}")
118
  return np.array([])
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  @modal.method()
121
  def generate_podcast(self,
122
  num_speakers: int,
 
29
  .add_local_dir("backend_modal/modular", remote_path="/root/modular")
30
  .add_local_dir("backend_modal/processor", remote_path="/root/processor")
31
  .add_local_dir("backend_modal/voices", remote_path="/root/voices")
32
+ .add_local_dir("text_examples", remote_path="/root/text_examples")
33
  .add_local_dir("backend_modal/schedule", remote_path="/root/schedule")
34
  )
35
 
 
117
  print(f"Error reading audio {audio_path}: {e}")
118
  return np.array([])
119
 
120
+ @staticmethod
121
+ def _infer_num_speakers_from_script(script: str) -> int:
122
+ """
123
+ Infer number of speakers by counting distinct 'Speaker X:' tags in the script.
124
+ Robust to 0- or 1-indexed labels and repeated turns.
125
+ Falls back to 1 if none found.
126
+ """
127
+ import re
128
+ ids = re.findall(r'(?mi)^\s*Speaker\s+(\d+)\s*:', script)
129
+ return len({int(x) for x in ids}) if ids else 1
130
+
131
+ @modal.method()
132
+ def get_example_scripts(self):
133
+ examples_dir = "/root/text_examples"
134
+ example_scripts = []
135
+ example_scripts_natural = []
136
+ if not os.path.exists(examples_dir):
137
+ return [], []
138
+
139
+ original_files = [
140
+ "1p_ai_tedtalk.txt",
141
+ "1p_politcal_speech.txt",
142
+ "2p_financeipo_meeting.txt",
143
+ "2p_telehealth_meeting.txt",
144
+ "3p_military_meeting.txt",
145
+ "3p_oil_meeting.txt",
146
+ "4p_gamecreation_meeting.txt",
147
+ "4p_product_meeting.txt"
148
+ ]
149
+
150
+ for txt_file in original_files:
151
+ try:
152
+ with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
153
+ script_content = f.read().strip()
154
+ if script_content:
155
+ num_speakers = self._infer_num_speakers_from_script(script_content)
156
+ example_scripts.append([num_speakers, script_content])
157
+
158
+ natural_file = txt_file.replace('.txt', '_natural.txt')
159
+ natural_path = os.path.join(examples_dir, natural_file)
160
+ if os.path.exists(natural_path):
161
+ with open(natural_path, 'r', encoding='utf-8') as f:
162
+ natural_content = f.read().strip()
163
+ if natural_content:
164
+ num_speakers = self._infer_num_speakers_from_script(natural_content)
165
+ example_scripts_natural.append([num_speakers, natural_content])
166
+ else:
167
+ example_scripts_natural.append([num_speakers, script_content])
168
+ except Exception as e:
169
+ print(f"Error loading {txt_file}: {e}")
170
+
171
+ return example_scripts, example_scripts_natural
172
+
173
  @modal.method()
174
  def generate_podcast(self,
175
  num_speakers: int,