File size: 21,930 Bytes
b15cc82
 
 
 
 
 
 
 
f088ed2
 
b15cc82
 
 
 
 
 
 
 
 
 
 
0207836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15cc82
 
f088ed2
 
 
 
 
 
b15cc82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0207836
b15cc82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0207836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15cc82
 
 
 
 
 
 
 
 
d5cf69f
 
 
 
 
 
 
 
 
 
 
 
 
b15cc82
 
 
0207836
34c49bb
 
 
 
 
 
 
 
 
 
 
 
0207836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34c49bb
0207836
 
 
 
34c49bb
0207836
 
 
 
 
34c49bb
0207836
 
 
 
 
 
34c49bb
0207836
 
34c49bb
 
 
 
 
 
 
 
b15cc82
 
 
 
 
 
 
 
 
d5cf69f
 
 
 
b15cc82
d5cf69f
 
 
 
 
 
b15cc82
 
 
 
d5cf69f
 
5dc3e05
 
b15cc82
 
 
 
 
 
 
 
5dc3e05
d5cf69f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15cc82
 
 
d5cf69f
 
 
 
 
 
 
b15cc82
 
 
 
d5cf69f
b15cc82
0207836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15cc82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5cf69f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
import os
import gradio as gr
import modal
import traceback

# --- Configuration ---
# This is the name of your Modal stub.
MODAL_STUB_NAME = "vibevoice-generator"
MODAL_CLASS_NAME = "VibeVoiceModel" # Extract class name
MODAL_METHOD_NAME = "generate_podcast" # Extract method name
# NOTE(review): MODAL_METHOD_NAME is not referenced anywhere in this file —
# the method is accessed directly by attribute in the Modal connection block.

# These lists are now hardcoded because the data lives on the Modal container.
# For a more dynamic app, you could create a small Modal function to fetch these lists.
AVAILABLE_MODELS = ["VibeVoice-1.5B", "VibeVoice-7B"]
# Voice preset names. The prefix encodes language ("en"/"in"/"zh"), the suffix
# encodes gender, and "_bgm" marks presets that include background music.
AVAILABLE_VOICES = [
    "en-Alice_woman_bgm", "en-Alice_woman", "en-Carter_man", "en-Frank_man",
    "en-Maya_woman", "en-Yasser_man", "in-Samuel_man", "zh-Anchen_man_bgm",
    "zh-Bowen_man", "zh-Xinran_woman"
]
# Initial values for the four speaker dropdowns in the UI.
DEFAULT_SPEAKERS = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']

# Male and female voice categories for smart speaker selection.
# These are consumed by smart_speaker_selection (inside create_demo_interface)
# to match the per-script speaker genders in SCRIPT_SPEAKER_GENDERS.
MALE_VOICES = [
    "en-Carter_man",
    "en-Frank_man",
    "en-Yasser_man",
    "in-Samuel_man",
    "zh-Anchen_man_bgm",
    "zh-Bowen_man"
]
FEMALE_VOICES = [
    "en-Alice_woman_bgm",
    "en-Alice_woman",
    "en-Maya_woman",
    "zh-Xinran_woman"
]

# Load example scripts
def load_example_scripts():
    """Read the bundled example scripts from the ``text_examples`` directory.

    For each known example file, loads both the plain script and — when
    present — its "_natural" variant (same script with natural talking
    sounds). When the natural variant is missing, the plain script is used
    as its stand-in, so both lists always stay parallel.

    Returns:
        tuple[list[str], list[str]]: (plain scripts, natural-variant scripts),
        one entry per known example file. A missing plain file contributes an
        empty string. Both lists are empty when the directory doesn't exist.
    """
    examples_dir = "text_examples"
    scripts = []
    natural_scripts = []

    if not os.path.exists(examples_dir):
        return scripts, natural_scripts

    # Fixed, ordered list of the shipped examples (1p/2p/3p/4p = speaker count).
    known_files = [
        "1p_ai_tedtalk.txt",
        "1p_politcal_speech.txt",
        "2p_financeipo_meeting.txt",
        "2p_telehealth_meeting.txt",
        "3p_military_meeting.txt",
        "3p_oil_meeting.txt",
        "4p_gamecreation_meeting.txt",
        "4p_product_meeting.txt"
    ]

    def _read(path):
        # Small helper: slurp a UTF-8 text file.
        with open(path, 'r', encoding='utf-8') as fh:
            return fh.read()

    for name in known_files:
        plain_path = os.path.join(examples_dir, name)
        natural_path = os.path.join(examples_dir, name.replace(".txt", "_natural.txt"))

        scripts.append(_read(plain_path) if os.path.exists(plain_path) else "")

        if os.path.exists(natural_path):
            natural_scripts.append(_read(natural_path))
        else:
            # Fall back to the plain script just appended for this slot.
            natural_scripts.append(scripts[-1] if scripts else "")

    return scripts, natural_scripts

# Gender mapping for each script's speakers
# Gender mapping for each script's speakers.
# Parallel to the example-file list inside load_example_scripts: one inner
# list per example script, one entry ("male"/"female"/"neutral") per speaker,
# in speaking order. Consumed by smart_speaker_selection to pick voices.
SCRIPT_SPEAKER_GENDERS = [
    ["female"],  # AI TED Talk - Rachel
    ["neutral"],  # Political Speech - generic speaker
    ["male", "female"],  # Finance IPO - James, Patricia
    ["female", "male"],  # Telehealth - Jennifer, Tom
    ["female", "male", "female"],  # Military - Sarah, David, Lisa
    ["male", "female", "male"],  # Oil - Robert, Lisa, Michael
    ["male", "female", "male", "male"],  # Game Creation - Alex, Sarah, Marcus, Emma
    ["female", "male", "female", "male"]  # Product Meeting - Sarah, Marcus, Jennifer, David
]

# Load all example scripts once at import time (both plain and natural sets).
EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()

# --- Modal Connection ---
try:
    # Look up the remote class deployed under MODAL_STUB_NAME.
    RemoteVibeVoiceModel = modal.Cls.from_name(MODAL_STUB_NAME, MODAL_CLASS_NAME)
    # Create an instance of the remote class (a lightweight client-side handle).
    remote_model_instance = RemoteVibeVoiceModel()
    # Get the remote method; called later via .remote_gen(...) to stream updates.
    remote_generate_function = remote_model_instance.generate_podcast
    print("Successfully connected to Modal function.")
except modal.exception.NotFoundError:
    # Deployment missing. Leave remote_generate_function as a None sentinel:
    # the __main__ block and the generate wrapper both check for it.
    print("ERROR: Modal function not found.")
    print(f"Please deploy the Modal app first by running: modal deploy modal_runner.py")
    remote_generate_function = None

# --- Gradio UI Definition ---
# Shared theme, used by the full app and by the fallback error page in __main__.
theme = gr.themes.Ocean(
    primary_hue="indigo",
    secondary_hue="fuchsia",
    neutral_hue="slate",
).set(
    button_large_radius='*radius_sm'
)

def create_demo_interface():
    """Build the full Gradio Blocks UI for the conference generator.

    Lays out two tabs (Generate and Architecture), wires the example-script
    buttons, the duration estimator and the speaker-visibility events, and
    hooks the Generate button to the remote Modal inference generator.

    Returns:
        The assembled (unlaunched) gr.Blocks interface.
    """
    with gr.Blocks(
        title="VibeVoice - Conference Generator",
        theme=theme,
    ) as interface:
        # Banner image served from the Hugging Face space's public assets.
        gr.HTML("""
        <div style="width: 100%; margin-bottom: 20px;">
            <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/banner.png" 
                style="width: 100%; height: auto; border-radius: 15px; box-shadow: 0 10px 40px rgba(0,0,0,0.2);"
                alt="VibeVoice Banner">
        </div>
        """)
        gr.Markdown("## NOTE: The Large model takes significant generation time with limited increase in quality. I recommend trying 1.5B first.")
        
        with gr.Tabs():
            with gr.Tab("Generate"):
                gr.Markdown("### Generated Conference")
                # Final audio player; receives (sample_rate, np.ndarray) payloads
                # streamed back from the Modal function.
                complete_audio_output = gr.Audio(
                    label="Complete Conference (Download)",
                    type="numpy",
                    autoplay=False,
                    show_download_button=True,
                )
                
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Conference Settings")
                        model_dropdown = gr.Dropdown(
                            choices=AVAILABLE_MODELS,
                            value=AVAILABLE_MODELS[0],
                            label="Model",
                        )
                        num_speakers = gr.Slider(
                            minimum=1, maximum=4, value=2, step=1,
                            label="Number of Speakers",
                        )

                        gr.Markdown("### Speaker Selection")
                        # Four dropdowns are always created; only the first
                        # num_speakers are visible (toggled by the slider event).
                        speaker_selections = []
                        for i in range(4):
                            speaker = gr.Dropdown(
                                choices=AVAILABLE_VOICES,
                                value=DEFAULT_SPEAKERS[i] if i < len(DEFAULT_SPEAKERS) else None,
                                label=f"Speaker {i+1}",
                                visible=(i < 2),
                            )
                            speaker_selections.append(speaker)

                        with gr.Accordion("Advanced Settings", open=False):
                            cfg_scale = gr.Slider(
                                minimum=1.0, maximum=2.0, value=1.3, step=0.05,
                                label="CFG Scale (Guidance Strength)",
                            )

                    with gr.Column(scale=2):
                        gr.Markdown("### Script Input")
                        script_input = gr.Textbox(
                            label="Conversation Script",
                            placeholder="Enter your conference script here...",
                            lines=12,
                            max_lines=20,
                        )
                        
                        with gr.Row():
                            with gr.Column(scale=1):
                                gr.Markdown("### Example Scripts")
                                with gr.Row():
                                    # Toggle between plain and "_natural" script
                                    # variants when loading an example.
                                    use_natural = gr.Checkbox(
                                        value=True,
                                        label="Natural talking sounds",
                                        scale=1
                                    )
                                    duration_display = gr.Textbox(
                                        value="",
                                        label="Est. Duration",
                                        interactive=False,
                                        scale=1
                                    )
                        
                        # Button labels; index-aligned with EXAMPLE_SCRIPTS and
                        # SCRIPT_SPEAKER_GENDERS.
                        example_names = [
                            "AI TED Talk",
                            "Political Speech",
                            "Finance IPO Meeting",
                            "Telehealth Meeting",
                            "Military Meeting",
                            "Oil Meeting",
                            "Game Creation Meeting",
                            "Product Meeting"
                        ]
                        
                        # Two rows of four example buttons each.
                        example_buttons = []
                        with gr.Row():
                            for i in range(min(4, len(example_names))):
                                btn = gr.Button(example_names[i], size="sm", variant="secondary")
                                example_buttons.append(btn)
                        
                        with gr.Row():
                            for i in range(4, min(8, len(example_names))):
                                btn = gr.Button(example_names[i], size="sm", variant="secondary")
                                example_buttons.append(btn)
                        
                        generate_btn = gr.Button(
                            "🚀 Generate Conference (on Modal)", size="lg",
                            variant="primary",
                        )
                        log_output = gr.Textbox(
                            label="Generation Log",
                            lines=8, max_lines=15,
                            interactive=False,
                        )
                        with gr.Row():
                            status_display = gr.Markdown(
                                value="Status: idle.",
                                elem_id="status-display",
                            )
                            progress_slider = gr.Slider(
                                minimum=0,
                                maximum=100,
                                value=0,
                                step=1,
                                label="Progress",
                                interactive=False,
                            )

                def update_speaker_visibility(num_speakers):
                    """Show the first *num_speakers* dropdowns, hide the rest."""
                    return [gr.update(visible=(i < num_speakers)) for i in range(4)]
                
                def estimate_duration(script):
                    """Estimate duration based on word count."""
                    if not script:
                        return ""
                    words = len(script.split())
                    # Approximate 150 words per minute for natural speech
                    minutes = words / 150
                    if minutes < 1:
                        return f"~{int(minutes * 60)} seconds"
                    else:
                        return f"~{minutes:.1f} minutes"
                
                def smart_speaker_selection(gender_list):
                    """Select speakers based on gender requirements.

                    Picks, per requested gender, the first not-yet-used voice
                    from the matching pool (falling back to reuse, then to any
                    available voice for "neutral"/unknown genders).
                    """
                    selected = []
                    for gender in gender_list:
                        if gender == "male" and MALE_VOICES:
                            available = [v for v in MALE_VOICES if v not in selected]
                            if available:
                                selected.append(available[0])
                            else:
                                selected.append(MALE_VOICES[0])
                        elif gender == "female" and FEMALE_VOICES:
                            available = [v for v in FEMALE_VOICES if v not in selected]
                            if available:
                                selected.append(available[0])
                            else:
                                selected.append(FEMALE_VOICES[0])
                        else:
                            # neutral or fallback
                            available = [v for v in AVAILABLE_VOICES if v not in selected]
                            if available:
                                selected.append(available[0])
                            else:
                                selected.append(AVAILABLE_VOICES[0])
                    return selected
                
                def load_specific_example(idx, natural):
                    """Load a specific example script.

                    Returns [num_speakers, script_text, duration_text] followed
                    by exactly four speaker values (None-padded), matching the
                    outputs list wired on the example buttons.
                    """
                    if idx >= len(EXAMPLE_SCRIPTS):
                        return [2, "", ""] + [None, None, None, None]
                    
                    script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
                    genders = SCRIPT_SPEAKER_GENDERS[idx] if idx < len(SCRIPT_SPEAKER_GENDERS) else ["neutral"]
                    speakers = smart_speaker_selection(genders)
                    duration = estimate_duration(script)
                    
                    # Pad speakers to 4
                    while len(speakers) < 4:
                        speakers.append(None)
                    
                    return [len(genders), script, duration] + speakers[:4]
                
                # Connect example buttons
                # (i=idx default binds the loop variable per-iteration,
                # avoiding the late-binding-closure pitfall).
                for idx, btn in enumerate(example_buttons):
                    btn.click(
                        fn=lambda nat, i=idx: load_specific_example(i, nat),
                        inputs=[use_natural],
                        outputs=[num_speakers, script_input, duration_display] + speaker_selections,
                        queue=False
                    )
                
                # Update duration when script changes
                script_input.change(
                    fn=estimate_duration,
                    inputs=[script_input],
                    outputs=[duration_display],
                    queue=False
                )

                num_speakers.change(
                    fn=update_speaker_visibility,
                    inputs=[num_speakers],
                    outputs=speaker_selections
                )

                def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
                    """Stream (audio, log, status, progress) updates from Modal.

                    *speakers_and_params* carries the four speaker dropdown
                    values followed by cfg_scale, mirroring the inputs list
                    wired on generate_btn below.
                    """
                    if remote_generate_function is None:
                        error_message = "ERROR: Modal function not deployed. Please contact the space owner."
                        yield None, error_message, "Status: error.", gr.update(value=0)
                        return

                    # Show a message that we are calling the remote function
                    yield (
                        None,
                        "🔄 Calling remote GPU on Modal.com... this may take a moment to start.",
                        "**Connecting**\nRequesting GPU resources…",
                        gr.update(value=0),
                    )

                    try:
                        speakers = speakers_and_params[:4]
                        cfg_scale_val = speakers_and_params[4]
                        current_log = ""

                        # Stream updates from the Modal function
                        for update in remote_generate_function.remote_gen(
                            num_speakers=int(num_speakers_val),
                            script=script,
                            speaker_1=speakers[0],
                            speaker_2=speakers[1],
                            speaker_3=speakers[2],
                            speaker_4=speakers[3],
                            cfg_scale=cfg_scale_val,
                            model_name=model_choice
                        ):
                            if not update:
                                continue

                            # Each update is a dict; absent keys fall back to
                            # sensible defaults so partial payloads still render.
                            audio_payload = update.get("audio")
                            progress_pct = update.get("pct", 0)
                            stage_label = update.get("stage", "").replace("_", " ").title() or "Status"
                            status_line = update.get("status") or "Processing…"
                            current_log = update.get("log", current_log)

                            status_formatted = f"**{stage_label}**\n{status_line}"
                            # gr.update() leaves the audio component unchanged
                            # when this update carries no audio payload.
                            audio_output = audio_payload if audio_payload is not None else gr.update()

                            yield (
                                audio_output,
                                current_log,
                                status_formatted,
                                gr.update(value=progress_pct),
                            )
                    except Exception as e:
                        # Surface the full traceback in the log panel so space
                        # users can report actionable errors.
                        tb = traceback.format_exc()
                        print(f"Error calling Modal: {e}")
                        error_log = f"❌ An error occurred: {e}\n\n{tb}"
                        yield (
                            None,
                            error_log,
                            "**Error**\nInference failed.",
                            gr.update(value=0),
                        )

                generate_btn.click(
                    fn=generate_podcast_wrapper,
                    inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
                    outputs=[complete_audio_output, log_output, status_display, progress_slider]
                )
            
            # Static documentation tab (no event wiring below this point).
            with gr.Tab("Architecture"):
                with gr.Row():
                    gr.Markdown("""VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, 
                    such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly 
                    in scalability, speaker consistency, and natural turn-taking. A core innovation of VibeVoice is its use of continuous 
                    speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently 
                    preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice 
                    employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and 
                    dialogue flow, and a diffusion head to generate high-fidelity acoustic details. The model can synthesize speech up to 
                    90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.""")
                with gr.Row():    
                    with gr.Column():
                        gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")

                        gr.Markdown("""
                        ### Overview
                        
                        VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, 
                        such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, 
                        particularly in scalability, speaker consistency, and natural turn-taking.
                        
                        ### Key Features
                        
                        - **Multi-Speaker Support**: Handles up to 4 distinct speakers
                        - **Long-Form Generation**: Synthesizes speech up to 90 minutes
                        - **Natural Conversation Flow**: Includes turn-taking and interruptions
                        - **Ultra-Low Frame Rate**: 7.5 Hz tokenizers for efficiency
                        - **High Fidelity**: Preserves acoustic details while being computationally efficient
                        
                        ### Technical Architecture
                        
                        1. **Continuous Speech Tokenizers**: Acoustic and Semantic tokenizers at 7.5 Hz
                        2. **Next-Token Diffusion Framework**: Combines LLM understanding with diffusion generation
                        3. **Large Language Model**: Understands context and dialogue flow
                        4. **Diffusion Head**: Generates high-fidelity acoustic details
                        """)
                        
                    with gr.Column():
                        gr.HTML("""
                        <div style="width: 100%; padding: 20px;">
                            <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/diagram.jpg" 
                                style="width: 100%; height: auto; border-radius: 10px; box-shadow: 0 5px 20px rgba(0,0,0,0.15);"
                                alt="VibeVoice Architecture Diagram">
                        </div>
                        """)
                        
                        gr.Markdown("""
                        ### Model Variants
                        
                        **VibeVoice-1.5B**: Faster inference, suitable for real-time applications
                        **VibeVoice-7B**: Higher quality output, recommended for production use
                        
                        ### Performance Metrics
                        
                        <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/chart.png" 
                            style="width: 100%; height: auto; border-radius: 10px; margin-top: 20px;"
                            alt="Performance Comparison">
                        """)
    return interface

# --- Main Execution ---
if __name__ == "__main__":
    if remote_generate_function is not None:
        # Backend reachable: build and serve the full application.
        demo = create_demo_interface()
        demo.queue().launch(show_error=True)
    else:
        # Modal backend missing: serve a minimal page telling the operator
        # how to deploy it instead of launching a broken app.
        with gr.Blocks(theme=theme) as demo:
            gr.Markdown("# ❌ Configuration Error")
            gr.Markdown(
                "The Gradio application cannot connect to the Modal backend. "
                "The Modal app has not been deployed yet. "
                "Please run `modal deploy modal_runner.py` in your terminal and then refresh this page."
            )
        demo.launch()