# Model configuration: waveform VAE + DiT backbone + content adapter/encoders.
# The `_target_` keys look like Hydra/OmegaConf instantiate targets (dotted
# class paths under the project's `models` package) — confirm with the loader.
#
# NOTE(review): this file had lost its line breaks and indentation; the
# nesting below was reconstructed from key semantics (duplicate-key
# constraints and the parallel layout of sibling sections). Spots where the
# placement is an assumption are flagged inline — verify against the
# constructors of the `_target_` classes.

sample_rate: 24000

model:
  autoencoder:
    _target_: models.autoencoder.waveform.stable_vae.StableVAE
    encoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckEncoder
      in_channels: 1
      channels: 128
      c_mults:
        - 1
        - 2
        - 4
        - 8
      strides:
        - 2
        - 4
        - 6
        - 10
      # Twice the decoder/autoencoder latent_dim (256 vs 128); presumably the
      # VAE bottleneck splits the encoder output into two halves — confirm.
      latent_dim: 256
      use_snake: true
    decoder:
      _target_: models.autoencoder.waveform.stable_vae.OobleckDecoder
      out_channels: 1
      channels: 128
      c_mults:
        - 1
        - 2
        - 4
        - 8
      strides:
        - 2
        - 4
        - 6
        - 10
      latent_dim: 128
      use_snake: true
      final_tanh: false
    # The keys from here on sit at autoencoder level: `decoder` already has
    # its own `latent_dim`, so the second one cannot belong to it.
    io_channels: 1
    latent_dim: 128
    # Equals the product of the strides above (2 * 4 * 6 * 10 = 480).
    downsampling_ratio: 480
    # Same value as the top-level `sample_rate`; keep the two in sync.
    sample_rate: 24000
    pretrained_ckpt: vae/speech_audio_sound_step=1000000.ckpt
    bottleneck:
      _target_: models.autoencoder.waveform.stable_vae.VAEBottleneck

  backbone:
    _target_: models.dit.audio_dit.LayerFusionAudioDiT
    img_size: 1000
    patch_size: 1
    # in/out channel counts equal the autoencoder `latent_dim` (128).
    in_chans: 128
    out_chans: 128
    input_type: 1d
    embed_dim: 768
    depth: 16
    num_heads: 12
    mlp_ratio: 4.0
    qkv_bias: false
    qk_scale: null
    qk_norm: layernorm
    norm_layer: layernorm
    act_layer: geglu
    context_norm: true
    use_checkpoint: false
    time_fusion: ada
    ada_sola_rank: 32
    ada_sola_alpha: 32
    cls_dim: null
    ta_context_dim: 1024
    ta_context_fusion: add
    ta_context_norm: true
    context_dim: 1024
    context_fusion: cross
    context_max_length: null
    # `none` below is the literal string "none", not a YAML null (only
    # `null`/`~` parse as null).
    context_pe_method: none
    pe_method: none
    rope_mode: shared
    use_conv: true
    skip: true
    skip_norm: true

  content_adapter:
    _target_: models.content_adapter.CrossAttentionAdapter
    content_dim: 1024
    d_out: 1024
    prefix_dim: 1024
    num_heads: 16
    dropout: 0.2
    duration_grad_scale: 0.1
    duration_predictor:
      _target_: models.content_adapter.DurationPredictor
      in_channels: 1024
      filter_channels: 512
      n_layers: 5
      kernel_size: 3
      p_dropout: 0.5

  # NOTE(review): the next five keys are placed at `model` level.
  # `content_adapter` already defines `content_dim`, so this second one cannot
  # live there, and `_target_` here names the top-level model class. Whether
  # `content_dim` / `frame_resolution` / `duration_offset` might instead
  # belong to `duration_predictor` cannot be told from this file — confirm.
  content_dim: 1024
  frame_resolution: 0.005
  duration_offset: 1.0
  cfg_drop_ratio: 0.2
  _target_: models.flow_matching.DummyContentAudioFlowMatching

  # NOTE(review): assumed to be a child of `model` (i.e. a constructor
  # argument of the model class) rather than a top-level section — confirm.
  content_encoder:
    _target_: models.content_encoder.content_encoder.ContentEncoder
    embed_dim: 1024
    text_encoder:
      _target_: models.content_encoder.text_encoder.T5TextEncoder
      model_name: google/flan-t5-large
      embed_dim: 1024
    midi_encoder:
      _target_: models.content_encoder.midi_encoder.FastSpeech2MIDIEncoder
      phone_vocab_size: 61
      midi_vocab_size: 300
      slur_vocab_size: 2
      spk_config:
        _target_: models.content_encoder.midi_encoder.SpkConfig
        encoding_format: id
        num_spk: 20
      d_model: 512
      num_layers: 4
      num_heads: 2
      ffn_kernel_size: 9
      d_out: 1024
    audio_encoder:
      _target_: models.autoencoder.waveform.stable_vae.StableVAEProjectorWrapper
      # Equals the autoencoder `latent_dim` (128).
      vae_dim: 128
      embed_dim: 1024
    video_encoder:
      _target_: models.content_encoder.vision_encoder.MlpVideoEncoder
      video_feat_dim: 1024
      embed_dim: 1024
    phoneme_encoder:
      _target_: models.content_encoder.midi_encoder.FastSpeech2PhonemeEncoder
      phone_vocab_size: 92
      d_model: 512
      num_layers: 4
      num_heads: 2
      ffn_kernel_size: 9
      d_out: 1024
      spk_config:
        _target_: models.content_encoder.midi_encoder.SpkConfig
        encoding_format: embedding
        spk_embed_dim: 256