# app.py — OmniVoice Gradio demo (upload rev 68baf64, by neo7team)
import gradio as gr
import torch
import torchaudio
import os
import tempfile
import spaces
from datetime import datetime
from omnivoice import OmniVoice
# ─── Model ───
# Load OmniVoice once at import time so every request reuses the same weights.
# Prefer CUDA with fp16; fall back to CPU with fp32.
print("モデルを読み込み中...")
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32
# NOTE(review): the checkpoint name says bf16 but we load fp16 on GPU — confirm intended.
model = OmniVoice.from_pretrained("drbaph/OmniVoice-bf16", device_map=device, dtype=dtype)
print(f"モデル読み込み完了({device})")
def _build_instruct(gender, age, pitch, style):
parts = []
if gender and gender != "Auto":
parts.append(gender.lower())
if age and age != "Auto":
parts.append(age.lower())
if pitch and pitch != "Auto":
parts.append(f"{pitch.lower()} pitch")
if style and style != "Auto":
parts.append(style.lower())
return ", ".join(parts) if parts else None
# ─── Voice Design / Auto ───
@spaces.GPU
def generate_design(text, mode, language, gender, age, pitch, style,
                    speed, duration, num_step, guidance_scale, denoise, postprocess):
    """Generate speech in "Auto" or "Voice Design" mode.

    Returns a (wav_filepath_or_None, status_message) pair for the Gradio
    outputs. In "Voice Design" mode a natural-language instruct string is
    built from the gender/age/pitch/style dropdowns.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"
    kwargs = dict(num_step=int(num_step), guidance_scale=guidance_scale, denoise=denoise)
    if language and language != "Auto":
        kwargs["language"] = language
    if mode == "Voice Design":
        instruct = _build_instruct(gender, age, pitch, style)
        if instruct:
            kwargs["instruct"] = instruct
    # Fix: apply duration/speed uniformly in both modes, matching generate_clone
    # and the UI hint that a non-zero duration overrides speed. Previously
    # duration was silently ignored in "Auto" mode, and speed was never sent
    # in "Voice Design" mode.
    if duration and duration > 0:
        kwargs["duration"] = duration
    else:
        kwargs["speed"] = speed
    if postprocess:
        kwargs["postprocess_output"] = True
    try:
        audio = model.generate(text=text, **kwargs)
        # delete=False: Gradio serves the file after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            torchaudio.save(f.name, audio[0], 24000)
        # assumes audio[0] is (channels, samples) at 24 kHz — TODO confirm
        return f.name, f"生成完了({audio[0].shape[1]/24000:.1f}秒)"
    except Exception as e:
        # UI boundary: surface any model error as a status message instead of crashing.
        return None, f"エラー: {e}"
# ─── Voice Clone ───
@spaces.GPU
def generate_clone(text, ref_audio, ref_text, language, speed, duration,
                   num_step, guidance_scale, denoise, postprocess):
    """Speak ``text`` in the voice of the uploaded reference audio.

    Returns a (wav_filepath_or_None, status_message) pair for the Gradio
    outputs. An empty/whitespace transcript means the model transcribes
    the reference itself; a non-zero duration overrides speed.
    """
    if not text or not text.strip():
        return None, "テキストを入力してください。"
    if ref_audio is None:
        return None, "リファレンス音声をアップロードしてください。"

    gen_args = {
        "num_step": int(num_step),
        "guidance_scale": guidance_scale,
        "denoise": denoise,
    }
    if language and language != "Auto":
        gen_args["language"] = language
    if duration and duration > 0:
        gen_args["duration"] = duration
    else:
        gen_args["speed"] = speed
    if postprocess:
        gen_args["postprocess_output"] = True

    # Blank transcript → None so the model falls back to auto-transcription.
    transcript = ref_text if ref_text and ref_text.strip() else None
    try:
        audio = model.generate(
            text=text,
            ref_audio=ref_audio,
            ref_text=transcript,
            **gen_args,
        )
        # delete=False: Gradio serves the file after this function returns.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as out:
            torchaudio.save(out.name, audio[0], 24000)
        # assumes audio[0] is (channels, samples) at 24 kHz — TODO confirm
        return out.name, f"生成完了({audio[0].shape[1]/24000:.1f}秒)"
    except Exception as e:
        # UI boundary: surface any model error as a status message instead of crashing.
        return None, f"エラー: {e}"
# ─── UI ───
CSS = """
.main-title { text-align: center; font-size: 1.8em; font-weight: 800; margin-bottom: 0; }
.subtitle { text-align: center; color: #888; font-size: 0.9em; margin-bottom: 1em; }
footer { display: none !important; }
"""
with gr.Blocks(title="OmniVoice") as app:
gr.HTML("<h1 class='main-title'>OmniVoice</h1>")
gr.HTML("<p class='subtitle'>AI Voice Generator — Personal</p>")
with gr.Tabs():
# ── Voice Design / Auto ──
with gr.Tab("Voice Design"):
with gr.Row():
with gr.Column(scale=1):
d_text = gr.Textbox(label="読み上げテキスト", lines=4,
placeholder="テキストを入力...")
d_mode = gr.Radio(["Auto", "Voice Design"], value="Auto", label="モード")
d_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
value="Auto", label="言語")
with gr.Group(visible=False) as d_voice_opts:
with gr.Row():
d_gender = gr.Dropdown(["Auto", "Female", "Male"],
value="Auto", label="性別")
d_age = gr.Dropdown(["Auto", "Child", "Young", "Middle-aged", "Elderly"],
value="Auto", label="年齢")
with gr.Row():
d_pitch = gr.Dropdown(
["Auto", "Very low", "Low", "Moderate", "High", "Very high"],
value="Auto", label="ピッチ")
d_style = gr.Dropdown(["Auto", "Whisper"],
value="Auto", label="スタイル")
d_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")
with gr.Accordion("詳細設定", open=False):
d_duration = gr.Number(value=0, label="Duration(秒)",
info="0で自動。設定するとSpeedは無視")
d_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
d_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
d_denoise = gr.Checkbox(value=True, label="Denoise")
d_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")
d_btn = gr.Button("音声を生成", variant="primary", size="lg")
with gr.Column(scale=1):
d_audio = gr.Audio(label="生成結果", type="filepath")
d_status = gr.Textbox(label="ステータス", interactive=False)
d_mode.change(
fn=lambda m: gr.update(visible=m == "Voice Design"),
inputs=d_mode, outputs=d_voice_opts,
)
d_btn.click(
fn=generate_design,
inputs=[d_text, d_mode, d_lang, d_gender, d_age, d_pitch, d_style,
d_speed, d_duration, d_steps, d_cfg, d_denoise, d_postprocess],
outputs=[d_audio, d_status],
)
# ── Voice Clone ──
with gr.Tab("Voice Clone"):
with gr.Row():
with gr.Column(scale=1):
c_text = gr.Textbox(label="読み上げテキスト", lines=4,
placeholder="この声で読み上げたいテキスト...")
c_ref = gr.Audio(label="リファレンス音声(3〜15秒)", type="filepath")
c_ref_text = gr.Textbox(label="書き起こし(任意)", lines=2,
placeholder="省略すると自動書き起こし")
c_lang = gr.Dropdown(["Auto", "Japanese", "English", "Korean"],
value="Auto", label="言語")
c_speed = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="速度")
with gr.Accordion("詳細設定", open=False):
c_duration = gr.Number(value=0, label="Duration(秒)")
c_steps = gr.Slider(4, 64, value=32, step=1, label="Inference Steps")
c_cfg = gr.Slider(0.5, 5.0, value=2.0, step=0.1, label="Guidance Scale")
c_denoise = gr.Checkbox(value=True, label="Denoise")
c_postprocess = gr.Checkbox(value=True, label="Postprocess(無音除去)")
c_btn = gr.Button("音声を生成", variant="primary", size="lg")
with gr.Column(scale=1):
c_audio = gr.Audio(label="生成結果", type="filepath")
c_status = gr.Textbox(label="ステータス", interactive=False)
c_btn.click(
fn=generate_clone,
inputs=[c_text, c_ref, c_ref_text, c_lang, c_speed,
c_duration, c_steps, c_cfg, c_denoise, c_postprocess],
outputs=[c_audio, c_status],
)
if __name__ == "__main__":
app.launch()