# OmniVoice — Gradio demo application (Hugging Face Space)
import gradio as gr
import torch
import torchaudio
import os
import tempfile
import spaces
from datetime import datetime
from omnivoice import OmniVoice

# ─── Model ───
# Loaded once at import time so every Gradio request reuses the same weights.
print("モデルを読み込み中...")  # "Loading model..."
device = "cuda" if torch.cuda.is_available() else "cpu"
# Half precision on GPU, full precision on CPU.
dtype = torch.float16 if device == "cuda" else torch.float32
model = OmniVoice.from_pretrained("drbaph/OmniVoice-bf16", device_map=device, dtype=dtype)
print(f"モデル読み込み完了({device})")  # "Model loaded (<device>)"
| def _build_instruct(gender, age, pitch, style): | |
| parts = [] | |
| if gender and gender != "Auto": | |
| parts.append(gender.lower()) | |
| if age and age != "Auto": | |
| parts.append(age.lower()) | |
| if pitch and pitch != "Auto": | |
| parts.append(f"{pitch.lower()} pitch") | |
| if style and style != "Auto": | |
| parts.append(style.lower()) | |
| return ", ".join(parts) if parts else None | |
# ─── Voice Design / Auto ───
@spaces.GPU  # fix: `spaces` was imported but unused — ZeroGPU Spaces require this decorator for CUDA access
def generate_design(text, mode, language, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Generate speech in "Auto" or "Voice Design" mode.

    Returns a ``(filepath, status_message)`` tuple for the Gradio outputs;
    ``filepath`` is None when validation fails or generation raises.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"
    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
    if language and language != "Auto":
        kwargs["language"] = language
    if mode == "Voice Design":
        # Voice attributes are only honored in "Voice Design" mode.
        instruct = _build_instruct(gender, age, pitch, style)
        if instruct:
            kwargs["instruct"] = instruct
    # An explicit positive duration overrides the speed slider (see UI hint).
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed
    if postprocess:
        kwargs["postprocess_output"] = True
    try:
        audio = model.generate(text=text, **kwargs)
        # delete=False: Gradio must still be able to serve the file after return.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            torchaudio.save(f.name, audio[0], 24000)  # assumes 24 kHz model output — TODO confirm
        return f.name, f"生成完了({audio[0].shape[1]/24000:.1f}秒)"
    except Exception as e:
        # UI boundary: surface any generation failure as a status message.
        return None, f"エラー: {e}"
# ─── Voice Clone ───
@spaces.GPU  # fix: `spaces` was imported but unused — ZeroGPU Spaces require this decorator for CUDA access
def generate_clone(text, ref_audio, ref_text, language, speed, duration,
                   num_step, guidance_scale, denoise, postprocess):
    """Synthesize ``text`` in the voice of an uploaded reference clip.

    Returns a ``(filepath, status_message)`` tuple for the Gradio outputs;
    ``filepath`` is None when validation fails or generation raises.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"
    if ref_audio is None:
        return None, "リファレンス音声をアップロードしてください。"
    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
    if language and language != "Auto":
        kwargs["language"] = language
    # An explicit positive duration overrides the speed slider.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed
    if postprocess:
        kwargs["postprocess_output"] = True
    try:
        audio = model.generate(
            text=text,
            ref_audio=ref_audio,
            # Blank/whitespace transcript -> None (UI states it is then auto-transcribed).
            ref_text=ref_text if ref_text and ref_text.strip() else None,
            **kwargs,
        )
        # delete=False: Gradio must still be able to serve the file after return.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            torchaudio.save(f.name, audio[0], 24000)  # assumes 24 kHz model output — TODO confirm
        return f.name, f"生成完了({audio[0].shape[1]/24000:.1f}秒)"
    except Exception as e:
        # UI boundary: surface any generation failure as a status message.
        return None, f"エラー: {e}"
# ─── UI ───
CSS = """
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
footer { display: none !important; }
"""

# Fix: CSS was defined but never passed to gr.Blocks, so the custom styling
# (including hiding the Gradio footer) was never applied.
with gr.Blocks(title="OmniVoice", css=CSS) as app:
    gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
    gr.HTML("<p class='subtitle'>AI Voice Generator — Personal</p>")
    with gr.Tabs():
        # ── Voice Design / Auto ──
        with gr.Tab("Voice Design"):
            with gr.Row():
                with gr.Column(scale=1):
                    d_text = gr.Textbox(label="読み上げテキスト", lines=4,
                                        placeholder="テキストを入力...")
                    d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="モード")
                    d_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
                                         value="Auto", label="言語")
                    # Voice attributes stay hidden until "Voice Design" mode is chosen.
                    with gr.Group(visible=False) as d_voice_opts:
                        with gr.Row():
                            d_gender = gr.Dropdown(["Auto", "Female", "Male"],
                                                   value="Auto", label="性別")
                            d_age = gr.Dropdown(["Auto", "Child", "Young", "Middle-aged", "Elderly"],
                                                value="Auto", label="年齢")
                        with gr.Row():
                            d_pitch = gr.Dropdown(
                                ["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
                                value="Auto", label="ピッチ")
                            d_style = gr.Dropdown(["Auto", "Whisper"],
                                                  value="Auto", label="スタイル")
                    d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")
                    with gr.Accordion("詳細設定", open=False):
                        d_duration = gr.Number(value=0, label="Duration(秒)",
                                               info="0で自動。設定するとSpeedは無視")
                        d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        d_denoise = gr.Checkbox(value=True, label="Denoise")
                        d_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")
                    d_btn = gr.Button("音声を生成", variant="primary", size="lg")
                with gr.Column(scale=1):
                    d_audio = gr.Audio(label="生成結果", type="filepath")
                    d_status = gr.Textbox(label="ステータス", interactive=False)
            # Toggle the voice-design option group with the mode radio.
            d_mode.change(
                fn=lambda m: gr.update(visible=m == "Voice Design"),
                inputs=d_mode, outputs=d_voice_opts,
            )
            d_btn.click(
                fn=generate_design,
                inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
                        d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
                outputs=[d_audio, d_status],
            )
        # ── Voice Clone ──
        with gr.Tab("Voice Clone"):
            with gr.Row():
                with gr.Column(scale=1):
                    c_text = gr.Textbox(label="読み上げテキスト", lines=4,
                                        placeholder="この声で読み上げたいテキスト...")
                    c_ref = gr.Audio(label="リファレンス音声(3〜15秒)", type="filepath")
                    c_ref_text = gr.Textbox(label="書き起こし(任意)", lines=2,
                                            placeholder="省略すると自動書き起こし")
                    c_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
                                         value="Auto", label="言語")
                    c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")
                    with gr.Accordion("詳細設定", open=False):
                        c_duration = gr.Number(value=0, label="Duration(秒)")
                        c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
                        c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
                        c_denoise = gr.Checkbox(value=True, label="Denoise")
                        c_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")
                    c_btn = gr.Button("音声を生成", variant="primary", size="lg")
                with gr.Column(scale=1):
                    c_audio = gr.Audio(label="生成結果", type="filepath")
                    c_status = gr.Textbox(label="ステータス", interactive=False)
            c_btn.click(
                fn=generate_clone,
                inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
                        c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
                outputs=[c_audio, c_status],
            )

if __name__ == "__main__":
    app.launch()