ACloudCenter committed on
Commit
0207836
·
1 Parent(s): 0a19e95

Fix Missing UI comps

Browse files
Files changed (2) hide show
  1. app.py +217 -1
  2. backend_modal/modal_runner.py +54 -1
app.py CHANGED
@@ -19,6 +19,75 @@ AVAILABLE_VOICES = [
19
  ]
20
  DEFAULT_SPEAKERS = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  # --- Modal Connection ---
23
  try:
24
  # Look up the remote class
@@ -54,7 +123,7 @@ def create_demo_interface():
54
  alt="VibeVoice Banner">
55
  </div>
56
  """)
57
- gr.Markdown("## GPU processing is now offloaded to a Modal.com backend!")
58
 
59
  with gr.Tabs():
60
  with gr.Tab("Generate"):
@@ -104,6 +173,45 @@ def create_demo_interface():
104
  lines=12,
105
  max_lines=20,
106
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  generate_btn = gr.Button(
108
  "🚀 Generate Conference (on Modal)", size="lg",
109
  variant="primary",
@@ -116,6 +224,55 @@ def create_demo_interface():
116
 
117
  def update_speaker_visibility(num_speakers):
118
  return [gr.update(visible=(i < num_speakers)) for i in range(4)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
 
120
  num_speakers.change(
121
  fn=update_speaker_visibility,
@@ -156,6 +313,65 @@ def create_demo_interface():
156
  inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
157
  outputs=[complete_audio_output, log_output]
158
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  return interface
160
 
161
  # --- Main Execution ---
 
19
  ]
20
  DEFAULT_SPEAKERS = ['en-Alice_woman', 'en-Carter_man', 'en-Frank_man', 'en-Maya_woman']
21
 
22
+ # Male and female voice categories for smart speaker selection
23
+ MALE_VOICES = [
24
+ "en-Carter_man",
25
+ "en-Frank_man",
26
+ "en-Yasser_man",
27
+ "in-Samuel_man",
28
+ "zh-Anchen_man_bgm",
29
+ "zh-Bowen_man"
30
+ ]
31
+ FEMALE_VOICES = [
32
+ "en-Alice_woman_bgm",
33
+ "en-Alice_woman",
34
+ "en-Maya_woman",
35
+ "zh-Xinran_woman"
36
+ ]
37
+
38
+ # Load example scripts
39
+ def load_example_scripts():
40
+ examples_dir = "text_examples"
41
+ example_scripts = []
42
+ example_scripts_natural = []
43
+
44
+ if not os.path.exists(examples_dir):
45
+ return example_scripts, example_scripts_natural
46
+
47
+ original_files = [
48
+ "1p_ai_tedtalk.txt",
49
+ "1p_politcal_speech.txt",
50
+ "2p_financeipo_meeting.txt",
51
+ "2p_telehealth_meeting.txt",
52
+ "3p_military_meeting.txt",
53
+ "3p_oil_meeting.txt",
54
+ "4p_gamecreation_meeting.txt",
55
+ "4p_product_meeting.txt"
56
+ ]
57
+
58
+ for txt_file in original_files:
59
+ file_path = os.path.join(examples_dir, txt_file)
60
+ natural_file = txt_file.replace(".txt", "_natural.txt")
61
+ natural_path = os.path.join(examples_dir, natural_file)
62
+
63
+ if os.path.exists(file_path):
64
+ with open(file_path, 'r', encoding='utf-8') as f:
65
+ example_scripts.append(f.read())
66
+ else:
67
+ example_scripts.append("")
68
+
69
+ if os.path.exists(natural_path):
70
+ with open(natural_path, 'r', encoding='utf-8') as f:
71
+ example_scripts_natural.append(f.read())
72
+ else:
73
+ example_scripts_natural.append(example_scripts[-1] if example_scripts else "")
74
+
75
+ return example_scripts, example_scripts_natural
76
+
77
+ # Gender mapping for each script's speakers
78
+ SCRIPT_SPEAKER_GENDERS = [
79
+ ["female"], # AI TED Talk - Rachel
80
+ ["neutral"], # Political Speech - generic speaker
81
+ ["male", "female"], # Finance IPO - James, Patricia
82
+ ["female", "male"], # Telehealth - Jennifer, Tom
83
+ ["female", "male", "female"], # Military - Sarah, David, Lisa
84
+ ["male", "female", "male"], # Oil - Robert, Lisa, Michael
85
+ ["male", "female", "male", "male"], # Game Creation - Alex, Sarah, Marcus, Emma
86
+ ["female", "male", "female", "male"] # Product Meeting - Sarah, Marcus, Jennifer, David
87
+ ]
88
+
89
+ EXAMPLE_SCRIPTS, EXAMPLE_SCRIPTS_NATURAL = load_example_scripts()
90
+
91
  # --- Modal Connection ---
92
  try:
93
  # Look up the remote class
 
123
  alt="VibeVoice Banner">
124
  </div>
125
  """)
126
+ gr.Markdown("## NOTE: The Large model takes significant generation time with limited increase in quality. I recommend trying 1.5B first.")
127
 
128
  with gr.Tabs():
129
  with gr.Tab("Generate"):
 
173
  lines=12,
174
  max_lines=20,
175
  )
176
+
177
+ with gr.Row():
178
+ with gr.Column(scale=1):
179
+ gr.Markdown("### Example Scripts")
180
+ with gr.Row():
181
+ use_natural = gr.Checkbox(
182
+ value=True,
183
+ label="Natural talking sounds",
184
+ scale=1
185
+ )
186
+ duration_display = gr.Textbox(
187
+ value="",
188
+ label="Est. Duration",
189
+ interactive=False,
190
+ scale=1
191
+ )
192
+
193
+ example_names = [
194
+ "AI TED Talk",
195
+ "Political Speech",
196
+ "Finance IPO Meeting",
197
+ "Telehealth Meeting",
198
+ "Military Meeting",
199
+ "Oil Meeting",
200
+ "Game Creation Meeting",
201
+ "Product Meeting"
202
+ ]
203
+
204
+ example_buttons = []
205
+ with gr.Row():
206
+ for i in range(min(4, len(example_names))):
207
+ btn = gr.Button(example_names[i], size="sm", variant="secondary")
208
+ example_buttons.append(btn)
209
+
210
+ with gr.Row():
211
+ for i in range(4, min(8, len(example_names))):
212
+ btn = gr.Button(example_names[i], size="sm", variant="secondary")
213
+ example_buttons.append(btn)
214
+
215
  generate_btn = gr.Button(
216
  "🚀 Generate Conference (on Modal)", size="lg",
217
  variant="primary",
 
224
 
225
  def update_speaker_visibility(num_speakers):
226
  return [gr.update(visible=(i < num_speakers)) for i in range(4)]
227
+
228
+ def smart_speaker_selection(gender_list):
229
+ """Select speakers based on gender requirements."""
230
+ selected = []
231
+ for gender in gender_list:
232
+ if gender == "male" and MALE_VOICES:
233
+ available = [v for v in MALE_VOICES if v not in selected]
234
+ if available:
235
+ selected.append(available[0])
236
+ else:
237
+ selected.append(MALE_VOICES[0])
238
+ elif gender == "female" and FEMALE_VOICES:
239
+ available = [v for v in FEMALE_VOICES if v not in selected]
240
+ if available:
241
+ selected.append(available[0])
242
+ else:
243
+ selected.append(FEMALE_VOICES[0])
244
+ else:
245
+ # neutral or fallback
246
+ available = [v for v in AVAILABLE_VOICES if v not in selected]
247
+ if available:
248
+ selected.append(available[0])
249
+ else:
250
+ selected.append(AVAILABLE_VOICES[0])
251
+ return selected
252
+
253
+ def load_specific_example(idx, natural):
254
+ """Load a specific example script."""
255
+ if idx >= len(EXAMPLE_SCRIPTS):
256
+ return [2, ""] + [None, None, None, None]
257
+
258
+ script = EXAMPLE_SCRIPTS_NATURAL[idx] if natural else EXAMPLE_SCRIPTS[idx]
259
+ genders = SCRIPT_SPEAKER_GENDERS[idx] if idx < len(SCRIPT_SPEAKER_GENDERS) else ["neutral"]
260
+ speakers = smart_speaker_selection(genders)
261
+
262
+ # Pad speakers to 4
263
+ while len(speakers) < 4:
264
+ speakers.append(None)
265
+
266
+ return [len(genders), script] + speakers[:4]
267
+
268
+ # Connect example buttons
269
+ for idx, btn in enumerate(example_buttons):
270
+ btn.click(
271
+ fn=lambda nat, i=idx: load_specific_example(i, nat),
272
+ inputs=[use_natural],
273
+ outputs=[num_speakers, script_input] + speaker_selections,
274
+ queue=False
275
+ )
276
 
277
  num_speakers.change(
278
  fn=update_speaker_visibility,
 
313
  inputs=[model_dropdown, num_speakers, script_input] + speaker_selections + [cfg_scale],
314
  outputs=[complete_audio_output, log_output]
315
  )
316
+
317
+ with gr.Tab("Architecture"):
318
+ with gr.Row():
319
+ gr.Markdown("""VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
320
+ such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems, particularly
321
+ in scalability, speaker consistency, and natural turn-taking. A core innovation of VibeVoice is its use of continuous
322
+ speech tokenizers (Acoustic and Semantic) operating at an ultra-low frame rate of 7.5 Hz. These tokenizers efficiently
323
+ preserve audio fidelity while significantly boosting computational efficiency for processing long sequences. VibeVoice
324
+ employs a next-token diffusion framework, leveraging a Large Language Model (LLM) to understand textual context and
325
+ dialogue flow, and a diffusion head to generate high-fidelity acoustic details. The model can synthesize speech up to
326
+ 90 minutes long with up to 4 distinct speakers, surpassing the typical 1-2 speaker limits of many prior models.""")
327
+ with gr.Row():
328
+ with gr.Column():
329
+ gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")
330
+
331
+ gr.Markdown("""
332
+ ### Overview
333
+
334
+ VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
335
+ such as conferences, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems,
336
+ particularly in scalability, speaker consistency, and natural turn-taking.
337
+
338
+ ### Key Features
339
+
340
+ - **Multi-Speaker Support**: Handles up to 4 distinct speakers
341
+ - **Long-Form Generation**: Synthesizes speech up to 90 minutes
342
+ - **Natural Conversation Flow**: Includes turn-taking and interruptions
343
+ - **Ultra-Low Frame Rate**: 7.5 Hz tokenizers for efficiency
344
+ - **High Fidelity**: Preserves acoustic details while being computationally efficient
345
+
346
+ ### Technical Architecture
347
+
348
+ 1. **Continuous Speech Tokenizers**: Acoustic and Semantic tokenizers at 7.5 Hz
349
+ 2. **Next-Token Diffusion Framework**: Combines LLM understanding with diffusion generation
350
+ 3. **Large Language Model**: Understands context and dialogue flow
351
+ 4. **Diffusion Head**: Generates high-fidelity acoustic details
352
+ """)
353
+
354
+ with gr.Column():
355
+ gr.HTML("""
356
+ <div style="width: 100%; padding: 20px;">
357
+ <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/diagram.jpg"
358
+ style="width: 100%; height: auto; border-radius: 10px; box-shadow: 0 5px 20px rgba(0,0,0,0.15);"
359
+ alt="VibeVoice Architecture Diagram">
360
+ </div>
361
+ """)
362
+
363
+ gr.Markdown("""
364
+ ### Model Variants
365
+
366
+ **VibeVoice-1.5B**: Faster inference, suitable for real-time applications
367
+ **VibeVoice-7B**: Higher quality output, recommended for production use
368
+
369
+ ### Performance Metrics
370
+
371
+ <img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/chart.png"
372
+ style="width: 100%; height: auto; border-radius: 10px; margin-top: 20px;"
373
+ alt="Performance Comparison">
374
+ """)
375
  return interface
376
 
377
  # --- Main Execution ---
backend_modal/modal_runner.py CHANGED
@@ -29,7 +29,7 @@ image = (
29
  .add_local_dir("backend_modal/modular", remote_path="/root/modular")
30
  .add_local_dir("backend_modal/processor", remote_path="/root/processor")
31
  .add_local_dir("backend_modal/voices", remote_path="/root/voices")
32
- .add_local_dir("./text_examples", remote_path="/root/text_examples")
33
  .add_local_dir("backend_modal/schedule", remote_path="/root/schedule")
34
  )
35
 
@@ -117,6 +117,59 @@ class VibeVoiceModel:
117
  print(f"Error reading audio {audio_path}: {e}")
118
  return np.array([])
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  @modal.method()
121
  def generate_podcast(self,
122
  num_speakers: int,
 
29
  .add_local_dir("backend_modal/modular", remote_path="/root/modular")
30
  .add_local_dir("backend_modal/processor", remote_path="/root/processor")
31
  .add_local_dir("backend_modal/voices", remote_path="/root/voices")
32
+ .add_local_dir("text_examples", remote_path="/root/text_examples")
33
  .add_local_dir("backend_modal/schedule", remote_path="/root/schedule")
34
  )
35
 
 
117
  print(f"Error reading audio {audio_path}: {e}")
118
  return np.array([])
119
 
120
+ @staticmethod
121
+ def _infer_num_speakers_from_script(script: str) -> int:
122
+ """
123
+ Infer number of speakers by counting distinct 'Speaker X:' tags in the script.
124
+ Robust to 0- or 1-indexed labels and repeated turns.
125
+ Falls back to 1 if none found.
126
+ """
127
+ import re
128
+ ids = re.findall(r'(?mi)^\s*Speaker\s+(\d+)\s*:', script)
129
+ return len({int(x) for x in ids}) if ids else 1
130
+
131
+ @modal.method()
132
+ def get_example_scripts(self):
133
+ examples_dir = "/root/text_examples"
134
+ example_scripts = []
135
+ example_scripts_natural = []
136
+ if not os.path.exists(examples_dir):
137
+ return [], []
138
+
139
+ original_files = [
140
+ "1p_ai_tedtalk.txt",
141
+ "1p_politcal_speech.txt",
142
+ "2p_financeipo_meeting.txt",
143
+ "2p_telehealth_meeting.txt",
144
+ "3p_military_meeting.txt",
145
+ "3p_oil_meeting.txt",
146
+ "4p_gamecreation_meeting.txt",
147
+ "4p_product_meeting.txt"
148
+ ]
149
+
150
+ for txt_file in original_files:
151
+ try:
152
+ with open(os.path.join(examples_dir, txt_file), 'r', encoding='utf-8') as f:
153
+ script_content = f.read().strip()
154
+ if script_content:
155
+ num_speakers = self._infer_num_speakers_from_script(script_content)
156
+ example_scripts.append([num_speakers, script_content])
157
+
158
+ natural_file = txt_file.replace('.txt', '_natural.txt')
159
+ natural_path = os.path.join(examples_dir, natural_file)
160
+ if os.path.exists(natural_path):
161
+ with open(natural_path, 'r', encoding='utf-8') as f:
162
+ natural_content = f.read().strip()
163
+ if natural_content:
164
+ num_speakers = self._infer_num_speakers_from_script(natural_content)
165
+ example_scripts_natural.append([num_speakers, natural_content])
166
+ else:
167
+ example_scripts_natural.append([num_speakers, script_content])
168
+ except Exception as e:
169
+ print(f"Error loading {txt_file}: {e}")
170
+
171
+ return example_scripts, example_scripts_natural
172
+
173
  @modal.method()
174
  def generate_podcast(self,
175
  num_speakers: int,