File size: 28,873 Bytes
b15cc82
 
 
 
 
 
 
 
f088ed2
 
b15cc82
 
 
 
 
 
 
 
 
 
 
0207836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15cc82
 
f088ed2
 
 
 
 
 
b15cc82
 
 
 
 
 
 
 
 
 
 
 
 
 
c7cf6c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15cc82
 
 
 
 
 
 
 
 
 
 
 
 
0207836
b15cc82
 
 
 
c7cf6c1
 
 
 
b15cc82
c7cf6c1
b15cc82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0207836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15cc82
 
 
 
 
 
 
 
 
d5cf69f
 
c7cf6c1
d5cf69f
 
 
 
 
 
 
 
 
 
b15cc82
 
 
0207836
34c49bb
 
 
 
 
 
 
 
 
 
 
 
0207836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34c49bb
0207836
 
 
 
34c49bb
0207836
 
 
 
 
34c49bb
0207836
 
 
 
 
 
34c49bb
0207836
 
34c49bb
 
 
 
 
 
 
 
b15cc82
 
 
 
 
 
 
 
 
d5cf69f
c7cf6c1
 
 
 
 
 
 
 
d5cf69f
 
c7cf6c1
 
 
 
d5cf69f
c7cf6c1
d5cf69f
c7cf6c1
 
 
d5cf69f
b15cc82
 
 
 
d5cf69f
c7cf6c1
 
 
 
 
d5cf69f
5dc3e05
 
b15cc82
 
 
 
 
 
 
 
5dc3e05
d5cf69f
 
 
9ae9cad
 
 
c7cf6c1
 
9ae9cad
d5cf69f
c7cf6c1
9ae9cad
c7cf6c1
d5cf69f
c7cf6c1
 
 
 
 
 
 
 
 
 
 
 
 
9ae9cad
 
c7cf6c1
9ae9cad
 
c7cf6c1
 
9ae9cad
c7cf6c1
 
 
 
 
 
9ae9cad
 
 
c7cf6c1
9ae9cad
 
c7cf6c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9ae9cad
c7cf6c1
9ae9cad
c7cf6c1
 
 
9ae9cad
b15cc82
 
 
d5cf69f
c7cf6c1
d5cf69f
c7cf6c1
d5cf69f
 
 
c7cf6c1
d5cf69f
b15cc82
 
 
 
c7cf6c1
b15cc82
0207836
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b15cc82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d5cf69f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
import os
import gradio as gr
import modal
import traceback

# --- Configuration ---
# This is the name of your Modal stub.
MODAL_STUB_NAME = "vibevoice-generator"
MODAL_CLASS_NAME = "VibeVoiceModel" # Extract class name
MODAL_METHOD_NAME = "generate_podcast" # Extract method name

# These lists are now hardcoded because the data lives on the Modal container.
# For a more dynamic app, you could create a small Modal function to fetch these lists.
AVAILABLE_MODELS = ["VibeVoice-1.5B", "VibeVoice-7B"]
AVAILABLE_VOICES = [
    "en-Alice_woman_bgm", "en-Alice_woman", "en-Carter_man", "en-Frank_man",
    "en-Maya_woman", "en-Yasser_man", "in-Samuel_man", "zh-Anchen_man_bgm",
    "zh-Bowen_man", "zh-Xinran_woman"
]
# Initial values for the four speaker dropdowns in the UI.
DEFAULT_SPEAKERS = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']

# Male and female voice categories for smart speaker selection
# (used by smart_speaker_selection to match voices to a script's speaker genders;
# both lists are subsets of AVAILABLE_VOICES).
MALE_VOICES = [
    "en-Carter_man",
    "en-Frank_man",
    "en-Yasser_man",
    "in-Samuel_man",
    "zh-Anchen_man_bgm",
    "zh-Bowen_man"
]
FEMALE_VOICES = [
    "en-Alice_woman_bgm",
    "en-Alice_woman",
    "en-Maya_woman",
    "zh-Xinran_woman"
]

# Load example scripts
def load_example_scripts():
    examples_dir = "text_examples"
    example_scripts = []
    example_scripts_natural = []
    
    if not os.path.exists(examples_dir):
        return example_scripts, example_scripts_natural
    
    original_files = [
        "1p_ai_tedtalk.txt",
        "1p_politcal_speech.txt",
        "2p_financeipo_meeting.txt",
        "2p_telehealth_meeting.txt",
        "3p_military_meeting.txt",
        "3p_oil_meeting.txt",
        "4p_gamecreation_meeting.txt",
        "4p_product_meeting.txt"
    ]
    
    for txt_file in original_files:
        file_path = os.path.join(examples_dir, txt_file)
        natural_file = txt_file.replace(".txt", "_natural.txt")
        natural_path = os.path.join(examples_dir, natural_file)
        
        if os.path.exists(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                example_scripts.append(f.read())
        else:
            example_scripts.append("")
            
        if os.path.exists(natural_path):
            with open(natural_path, 'r', encoding='utf-8') as f:
                example_scripts_natural.append(f.read())
        else:
            example_scripts_natural.append(example_scripts[-1] if example_scripts else "")
    
    return example_scripts, example_scripts_natural

# Gender mapping for each script's speakers
# Index-aligned with the example-script file list in load_example_scripts();
# entry i lists the gender of each speaker in example i, in speaking order.
SCRIPT_SPEAKER_GENDERS = [
    ["female"],  # AI TED Talk - Rachel
    ["neutral"],  # Political Speech - generic speaker
    ["male", "female"],  # Finance IPO - James, Patricia
    ["female", "male"],  # Telehealth - Jennifer, Tom
    ["female", "male", "female"],  # Military - Sarah, David, Lisa
    ["male", "female", "male"],  # Oil - Robert, Lisa, Michael
    ["male", "female", "male", "male"],  # Game Creation - Alex, Sarah, Marcus, Emma
    ["female", "male", "female", "male"]  # Product Meeting - Sarah, Marcus, Jennifer, David
]

# Loaded once at import time; both lists are index-aligned with the example buttons.
EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()

# --- Modal Connection ---
# Resolve the deployed Modal class/method at import time. On success,
# remote_generate_function is the bound remote method; on a missing
# deployment it is set to None and the UI degrades to an error page.
try:
    # Look up the remote class
    RemoteVibeVoiceModel = modal.Cls.from_name(MODAL_STUB_NAME, MODAL_CLASS_NAME)
    # Create an instance of the remote class
    remote_model_instance = RemoteVibeVoiceModel()
    # Get the remote method
    remote_generate_function = remote_model_instance.generate_podcast
    print("Successfully connected to Modal function.")
except modal.exception.NotFoundError:
    # NOTE(review): only NotFoundError is handled; other failures (auth,
    # network) would propagate and abort the import — confirm that is intended.
    print("ERROR: Modal function not found.")
    print(f"Please deploy the Modal app first by running: modal deploy modal_runner.py")
    remote_generate_function = None

# --- Gradio UI Definition ---
# Shared theme for both the full app and the fallback error page in __main__.
theme = gr.themes.Ocean(
    primary_hue="indigo",
    secondary_hue="fuchsia",
    neutral_hue="slate",
).set(
    button_large_radius='*radius_sm'
)

AUDIO_LABEL_DEFAULT = "Complete Conference (Download)"
# Stage key -> (panel title, default description) for the primary status
# markdown panel. A None description means only the live status line is shown.
PRIMARY_STAGE_MESSAGES = {
    "connecting": ("🚀 Request Submitted", "Provisioning GPU resources... cold starts can take up to a minute."),
    "queued": ("🚦 Waiting For GPU", "Worker is spinning up. Cold starts may take 30-60 seconds."),
    "loading_model": ("📦 Loading Model", "Streaming VibeVoice weights to the GPU."),
    "loading_voices": ("🎙️ Loading Voices", None),
    "preparing_inputs": ("📝 Preparing Script", "Formatting the conversation for the model."),
    "generating_audio": ("🎧 Generating Audio", "Synthesizing speech — this is the longest step."),
    "processing_audio": ("✨ Finalizing Audio", "Converting tensors into a playable waveform."),
    "complete": ("✅ Ready", "Press play below or download your conference."),
    "error": ("❌ Error", "Check the log for details."),
}
# Stage key -> label shown on the audio widget while that stage runs.
# "complete" is intentionally absent: on completion the label reverts to
# AUDIO_LABEL_DEFAULT in generate_podcast_wrapper.
AUDIO_STAGE_LABELS = {
    "connecting": "Complete Conference (requesting GPU...)",
    "queued": "Complete Conference (GPU warming up...)",
    "loading_model": "Complete Conference (loading model...)",
    "loading_voices": "Complete Conference (loading voices...)",
    "preparing_inputs": "Complete Conference (preparing inputs...)",
    "generating_audio": "Complete Conference (generating audio...)",
    "processing_audio": "Complete Conference (finalizing audio...)",
    "error": "Complete Conference (error)",
}
READY_PRIMARY_STATUS = "### Ready\nPress **Generate** to run VibeVoice."


def build_primary_status(stage: str, status_line: str) -> str:
    """Render the primary status panel markdown for a pipeline *stage*.

    Combines the stage's canned description (from PRIMARY_STAGE_MESSAGES)
    with the live *status_line*, skipping duplicates, under a `###` heading.
    Unknown stages fall back to a generic "Working" message.
    """
    heading, canned_desc = PRIMARY_STAGE_MESSAGES.get(stage, ("⚙️ Working", "Processing..."))
    segments = [canned_desc] if canned_desc else []
    if status_line and status_line not in segments:
        segments.append(status_line)
    body = "\n\n".join(segments) if segments else status_line
    return f"### {heading}\n{body}"


def create_demo_interface():
    """Assemble the Gradio Blocks UI for the VibeVoice conference generator.

    Builds two tabs:
      * "Generate" — model/speaker settings, script input, example loaders,
        and a streaming generation flow backed by the remote Modal method.
      * "Architecture" — static documentation and diagrams for VibeVoice.

    Returns:
        gr.Blocks: the fully wired interface (not yet launched).
    """
    with gr.Blocks(
        title="VibeVoice - Conference Generator",
        theme=theme,
    ) as interface:
        # Banner image served from the Hugging Face Space's public assets.
        gr.HTML("""
        <div style="width: 100%; margin-bottom: 20px;">
            <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/banner.png" 
                style="width: 100%; height: auto; border-radius: 15px; box-shadow: 0 10px 40px rgba(0,0,0,0.2);"
                alt="VibeVoice Banner">
        </div>
        """)
        gr.Markdown("## NOTE: The Large model takes significant generation time with limited increase in quality. I recommend trying 1.5B first.")
        
        with gr.Tabs():
            with gr.Tab("Generate"):
                gr.Markdown("### Generated Conference")
                primary_status = gr.Markdown(
                    value=READY_PRIMARY_STATUS,
                    elem_id="primary-status",
                )
                complete_audio_output = gr.Audio(
                    label=AUDIO_LABEL_DEFAULT,
                    type="numpy",
                    autoplay=False,
                    show_download_button=True,
                )
                
                with gr.Row():
                    with gr.Column(scale=1):
                        gr.Markdown("### Conference Settings")
                        model_dropdown = gr.Dropdown(
                            choices=AVAILABLE_MODELS,
                            value=AVAILABLE_MODELS[0],
                            label="Model",
                        )
                        num_speakers = gr.Slider(
                            minimum=1, maximum=4, value=2, step=1,
                            label="Number of Speakers",
                        )

                        gr.Markdown("### Speaker Selection")
                        # Always create 4 dropdowns so event wiring stays fixed;
                        # the num_speakers slider only toggles their visibility.
                        speaker_selections = []
                        for i in range(4):
                            speaker = gr.Dropdown(
                                choices=AVAILABLE_VOICES,
                                value=DEFAULT_SPEAKERS[i] if i < len(DEFAULT_SPEAKERS) else None,
                                label=f"Speaker {i+1}",
                                visible=(i < 2),
                            )
                            speaker_selections.append(speaker)

                        with gr.Accordion("Advanced Settings", open=False):
                            cfg_scale = gr.Slider(
                                minimum=1.0, maximum=2.0, value=1.3, step=0.05,
                                label="CFG Scale (Guidance Strength)",
                            )

                    with gr.Column(scale=2):
                        gr.Markdown("### Script Input")
                        script_input = gr.Textbox(
                            label="Conversation Script",
                            placeholder="Enter your conference script here...",
                            lines=12,
                            max_lines=20,
                        )
                        
                        with gr.Row():
                            with gr.Column(scale=1):
                                gr.Markdown("### Example Scripts")
                                with gr.Row():
                                    use_natural = gr.Checkbox(
                                        value=True,
                                        label="Natural talking sounds",
                                        scale=1
                                    )
                                    duration_display = gr.Textbox(
                                        value="",
                                        label="Est. Duration",
                                        interactive=False,
                                        scale=1
                                    )
                        
                        # Button captions; index-aligned with EXAMPLE_SCRIPTS
                        # and SCRIPT_SPEAKER_GENDERS.
                        example_names = [
                            "AI TED Talk",
                            "Political Speech",
                            "Finance IPO Meeting",
                            "Telehealth Meeting",
                            "Military Meeting",
                            "Oil Meeting",
                            "Game Creation Meeting",
                            "Product Meeting"
                        ]
                        
                        # Lay the example buttons out in two rows of four.
                        example_buttons = []
                        with gr.Row():
                            for i in range(min(4, len(example_names))):
                                btn = gr.Button(example_names[i], size="sm", variant="secondary")
                                example_buttons.append(btn)
                        
                        with gr.Row():
                            for i in range(4, min(8, len(example_names))):
                                btn = gr.Button(example_names[i], size="sm", variant="secondary")
                                example_buttons.append(btn)
                        
                        generate_btn = gr.Button(
                            "🚀 Generate Conference (on Modal)", size="lg",
                            variant="primary",
                        )
                        log_output = gr.Textbox(
                            label="Generation Log",
                            lines=8, max_lines=15,
                            interactive=False,
                        )
                        with gr.Row():
                            status_display = gr.Markdown(
                                value="**Idle**\nPress generate to get started.",
                                elem_id="status-display",
                            )
                            progress_slider = gr.Slider(
                                minimum=0,
                                maximum=100,
                                value=0,
                                step=1,
                                label="Progress",
                                interactive=False,
                            )

                def update_speaker_visibility(num_speakers):
                    """Show the first *num_speakers* dropdowns, hide the rest."""
                    return [gr.update(visible=(i < num_speakers)) for i in range(4)]
                
                def estimate_duration(script):
                    """Estimate duration based on word count."""
                    if not script:
                        return ""
                    words = len(script.split())
                    # Approximate 150 words per minute for natural speech
                    minutes = words / 150
                    if minutes < 1:
                        return f"~{int(minutes * 60)} seconds"
                    else:
                        return f"~{minutes:.1f} minutes"
                
                def smart_speaker_selection(gender_list):
                    """Select speakers based on gender requirements."""
                    # Prefer a voice not yet used; reuse the first list entry
                    # when the matching category is exhausted.
                    selected = []
                    for gender in gender_list:
                        if gender == "male" and MALE_VOICES:
                            available = [v for v in MALE_VOICES if v not in selected]
                            if available:
                                selected.append(available[0])
                            else:
                                selected.append(MALE_VOICES[0])
                        elif gender == "female" and FEMALE_VOICES:
                            available = [v for v in FEMALE_VOICES if v not in selected]
                            if available:
                                selected.append(available[0])
                            else:
                                selected.append(FEMALE_VOICES[0])
                        else:
                            # neutral or fallback
                            available = [v for v in AVAILABLE_VOICES if v not in selected]
                            if available:
                                selected.append(available[0])
                            else:
                                selected.append(AVAILABLE_VOICES[0])
                    return selected
                
                def load_specific_example(idx, natural):
                    """Load a specific example script.

                    Returns [num_speakers, script, duration, speaker1..speaker4]
                    matching the outputs wired on each example button.
                    """
                    if idx >= len(EXAMPLE_SCRIPTS):
                        return [2, "", ""] + [None, None, None, None]
                    
                    script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
                    genders = SCRIPT_SPEAKER_GENDERS[idx] if idx < len(SCRIPT_SPEAKER_GENDERS) else ["neutral"]
                    speakers = smart_speaker_selection(genders)
                    duration = estimate_duration(script)
                    
                    # Pad speakers to 4
                    while len(speakers) < 4:
                        speakers.append(None)
                    
                    return [len(genders), script, duration] + speakers[:4]
                
                # Connect example buttons
                # (i=idx default binds the loop variable at definition time,
                # avoiding the late-binding closure pitfall.)
                for idx, btn in enumerate(example_buttons):
                    btn.click(
                        fn=lambda nat, i=idx: load_specific_example(i, nat),
                        inputs=[use_natural],
                        outputs=[num_speakers, script_input, duration_display] + speaker_selections,
                        queue=False
                    )
                
                # Update duration when script changes
                script_input.change(
                    fn=estimate_duration,
                    inputs=[script_input],
                    outputs=[duration_display],
                    queue=False
                )

                num_speakers.change(
                    fn=update_speaker_visibility,
                    inputs=[num_speakers],
                    outputs=speaker_selections
                )

                def generate_podcast_wrapper(model_choice, num_speakers_val, script, *speakers_and_params):
                    """Stream generation progress from the remote Modal method.

                    Generator: each yield is a 5-tuple of updates for
                    (complete_audio_output, log_output, status_display,
                    progress_slider, primary_status). *speakers_and_params*
                    carries the four speaker dropdown values followed by the
                    CFG scale, matching the click wiring below.
                    """
                    if remote_generate_function is None:
                        error_message = "ERROR: Modal function not deployed. Please contact the space owner."
                        primary_error = build_primary_status("error", "Modal backend is offline.")
                        yield (
                            gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
                            error_message,
                            "**Error**\nModal backend unavailable.",
                            gr.update(value=0),
                            primary_error,
                        )
                        return

                    connecting_status_line = "Provisioning GPU resources... cold starts can take up to a minute."
                    primary_connecting = build_primary_status("connecting", connecting_status_line)
                    status_detail = "**Connecting**\nRequesting GPU resources…"

                    # Immediate feedback before the remote call starts.
                    yield (
                        gr.update(label=AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)),
                        "🔄 Calling remote GPU on Modal.com... this may take a moment to start.",
                        status_detail,
                        gr.update(value=1),
                        primary_connecting,
                    )

                    try:
                        speakers = speakers_and_params[:4]
                        cfg_scale_val = speakers_and_params[4]
                        current_log = ""
                        # last_* trackers seed the fallback branch for legacy
                        # (tuple-style) updates; last_status/last_primary are
                        # written but not currently read back.
                        last_pct = 1
                        last_status = status_detail
                        last_primary = primary_connecting
                        last_audio_label = AUDIO_STAGE_LABELS.get("connecting", AUDIO_LABEL_DEFAULT)
                        last_stage = "connecting"

                        # Stream updates from the Modal function
                        for update in remote_generate_function.remote_gen(
                            num_speakers=int(num_speakers_val),
                            script=script,
                            speaker_1=speakers[0],
                            speaker_2=speakers[1],
                            speaker_3=speakers[2],
                            speaker_4=speakers[3],
                            cfg_scale=cfg_scale_val,
                            model_name=model_choice
                        ):
                            if not update:
                                continue

                            if isinstance(update, dict):
                                # New-style structured update:
                                # {audio?, pct?, stage?, status?, log?}.
                                audio_payload = update.get("audio")
                                progress_pct = update.get("pct", last_pct)
                                stage_key = update.get("stage", last_stage) or last_stage
                                status_line = update.get("status") or "Processing..."
                                current_log = update.get("log", current_log)

                                stage_label = stage_key.replace("_", " ").title() if stage_key else "Status"
                                status_formatted = f"**{stage_label}**\n{status_line}"
                                # Clamp to the slider's 0-100 range.
                                progress_value = max(0, min(100, int(round(progress_pct))))

                                audio_label = AUDIO_STAGE_LABELS.get(stage_key)
                                if not audio_label:
                                    audio_label = f"Complete Conference ({stage_label.lower()})" if stage_label else AUDIO_LABEL_DEFAULT
                                if stage_key == "complete":
                                    audio_label = AUDIO_LABEL_DEFAULT
                                if stage_key == "error":
                                    progress_value = 0

                                primary_value = build_primary_status(stage_key, status_line)

                                audio_update = gr.update(label=audio_label)
                                if audio_payload is not None:
                                    audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)

                                yield (
                                    audio_update,
                                    current_log,
                                    status_formatted,
                                    gr.update(value=progress_value),
                                    primary_value,
                                )

                                last_pct = progress_value
                                last_status = status_formatted
                                last_primary = primary_value
                                last_audio_label = audio_label
                                last_stage = stage_key
                            else:
                                # Backwards compatibility: older backend returns (audio, log)
                                audio_payload, log_text = update if isinstance(update, (tuple, list)) else (None, str(update))
                                status_line = None
                                if log_text:
                                    current_log = log_text
                                    # Use the newest log line as the status line.
                                    status_line = log_text.splitlines()[-1]
                                if not status_line:
                                    status_line = "Processing..."

                                if audio_payload is not None:
                                    progress_value = 100
                                    audio_label = AUDIO_LABEL_DEFAULT
                                    primary_value = build_primary_status("complete", "Conference ready to download.")
                                    status_formatted = "**Complete**\nConference ready to download."
                                else:
                                    # No stage info in legacy mode: assume we are
                                    # at least in the audio-generation phase.
                                    progress_value = max(last_pct, 70)
                                    audio_label = AUDIO_STAGE_LABELS.get("generating_audio", last_audio_label)
                                    primary_value = build_primary_status("generating_audio", status_line)
                                    status_formatted = f"**Streaming**\n{status_line}"

                                audio_update = gr.update(label=audio_label)
                                if audio_payload is not None:
                                    audio_update = gr.update(value=audio_payload, label=AUDIO_LABEL_DEFAULT)

                                last_pct = progress_value
                                last_status = status_formatted
                                last_primary = primary_value
                                last_audio_label = audio_label

                                yield (
                                    audio_update,
                                    current_log,
                                    status_formatted,
                                    gr.update(value=progress_value),
                                    primary_value,
                                )
                    except Exception as e:
                        # Surface remote failures in the log panel with a full
                        # traceback instead of crashing the UI.
                        tb = traceback.format_exc()
                        print(f"Error calling Modal: {e}")
                        error_log = f"❌ An error occurred: {e}\n\n{tb}"
                        primary_error = build_primary_status("error", "Inference failed.")
                        yield (
                            gr.update(label=AUDIO_STAGE_LABELS.get("error", AUDIO_LABEL_DEFAULT)),
                            error_log,
                            "**Error**\nInference failed.",
                            gr.update(value=0),
                            primary_error,
                        )

                # Inputs order must match generate_podcast_wrapper's signature:
                # model, num_speakers, script, 4 speakers, then cfg_scale.
                generate_btn.click(
                    fn=generate_podcast_wrapper,
                    inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
                    outputs=[complete_audio_output, log_output, status_display, progress_slider, primary_status]
                )
            
            # Static documentation tab; no event handlers.
            with gr.Tab("Architecture"):
                with gr.Row():
                    gr.Markdown("""VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, 
                    such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly 
                    in scalability, speaker consistency, and natural turn-taking. A core innovation of VibeVoice is its use of continuous 
                    speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently 
                    preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice 
                    employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and 
                    dialogue flow, and a diffusion head to generate high-fidelity acoustic details. The model can synthesize speech up to 
                    90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.""")
                with gr.Row():    
                    with gr.Column():
                        gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")

                        gr.Markdown("""
                        ### Overview
                        
                        VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio, 
                        such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, 
                        particularly in scalability, speaker consistency, and natural turn-taking.
                        
                        ### Key Features
                        
                        - **Multi-Speaker Support**: Handles up to 4 distinct speakers
                        - **Long-Form Generation**: Synthesizes speech up to 90 minutes
                        - **Natural Conversation Flow**: Includes turn-taking and interruptions
                        - **Ultra-Low Frame Rate**: 7.5 Hz tokenizers for efficiency
                        - **High Fidelity**: Preserves acoustic details while being computationally efficient
                        
                        ### Technical Architecture
                        
                        1. **Continuous Speech Tokenizers**: Acoustic and Semantic tokenizers at 7.5 Hz
                        2. **Next-Token Diffusion Framework**: Combines LLM understanding with diffusion generation
                        3. **Large Language Model**: Understands context and dialogue flow
                        4. **Diffusion Head**: Generates high-fidelity acoustic details
                        """)
                        
                    with gr.Column():
                        gr.HTML("""
                        <div style="width: 100%; padding: 20px;">
                            <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/diagram.jpg" 
                                style="width: 100%; height: auto; border-radius: 10px; box-shadow: 0 5px 20px rgba(0,0,0,0.15);"
                                alt="VibeVoice Architecture Diagram">
                        </div>
                        """)
                        
                        gr.Markdown("""
                        ### Model Variants
                        
                        **VibeVoice-1.5B**: Faster inference, suitable for real-time applications
                        **VibeVoice-7B**: Higher quality output, recommended for production use
                        
                        ### Performance Metrics
                        
                        <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/chart.png" 
                            style="width: 100%; height: auto; border-radius: 10px; margin-top: 20px;"
                            alt="Performance Comparison">
                        """)
    return interface

# --- Main Execution ---
if __name__ == "__main__":
    if remote_generate_function is None:
        # If Modal isn't set up, we can't launch the full app.
        # We'll show a simplified UI with an error message.
        with gr.Blocks(theme=theme) as interface:
            gr.Markdown("# ❌ Configuration Error")
            gr.Markdown(
                "The Gradio application cannot connect to the Modal backend. "
                "The Modal app has not been deployed yet. "
                "Please run `modal deploy modal_runner.py` in your terminal and then refresh this page."
            )
        interface.launch()
    else:
        # Launch the full Gradio interface
        # queue() lets the generator-based click handler stream incremental
        # updates to the browser.
        interface = create_demo_interface()
        interface.queue().launch(show_error=True)