Init repository with huggingface version of MOSS_TTSD_tokenizer

Browse files

Files changed (3) hide show

config.json +124 -0
model.safetensors +3 -0
preprocessor_config.json +13 -0

config.json ADDED Viewed

	@@ -0,0 +1,124 @@

+{
+  "code_dim": 3072,
+  "decoder_upsample_rate": 2560,
+  "dtype": "float32",
+  "encoder_downsample_rate": 1280,
+  "initializer_range": 0.02,
+  "input_sample_rate": 16000,
+  "input_sampling_rate": 16000,
+  "model_type": "xy_tokenizer",
+  "output_sample_rate": 32000,
+  "params": {
+    "acoustic_decoder_kwargs": {
+      "activation_function": "gelu",
+      "d_model": 768,
+      "decoder_attention_heads": 12,
+      "decoder_ffn_dim": 3072,
+      "decoder_layers": 12,
+      "hop_length": 160,
+      "kernel_size": 3,
+      "max_audio_seconds": 30,
+      "num_mel_bins": 80,
+      "sampling_rate": 16000,
+      "scale_embedding": false,
+      "stride_size": 2
+    },
+    "acoustic_encoder_kwargs": {
+      "activation_function": "gelu",
+      "d_model": 768,
+      "encoder_attention_heads": 12,
+      "encoder_ffn_dim": 3072,
+      "encoder_layers": 12,
+      "hop_length": 160,
+      "kernel_size": 3,
+      "max_audio_seconds": 30,
+      "num_mel_bins": 80,
+      "sampling_rate": 16000,
+      "scale_embedding": false,
+      "stride_size": 2
+    },
+    "downsample_kwargs": {
+      "avg_pooler": 4,
+      "d_model": 768
+    },
+    "feature_extractor_kwargs": {
+      "chunk_length": 30,
+      "feature_size": 80,
+      "hop_length": 160,
+      "n_fft": 400,
+      "n_samples": 480000,
+      "nb_max_frames": 3000,
+      "padding_side": "right",
+      "padding_value": 0.0,
+      "return_attention_mask": true,
+      "return_tensors": "pt",
+      "sampling_rate": 16000
+    },
+    "post_rvq_adapter_kwargs": {
+      "d_model": 768,
+      "encoder_attention_heads": 12,
+      "encoder_ffn_dim": 3072,
+      "encoder_layers": 4,
+      "input_dim": 3072,
+      "max_source_positions": 375,
+      "output_dim": 3072
+    },
+    "pre_rvq_adapter_kwargs": {
+      "d_model": 768,
+      "encoder_attention_heads": 12,
+      "encoder_ffn_dim": 3072,
+      "encoder_layers": 4,
+      "input_dim": 1536,
+      "max_source_positions": 1500,
+      "output_dim": 768
+    },
+    "quantizer_kwargs": {
+      "codebook_dim": 512,
+      "codebook_size": 1024,
+      "input_dim": 3072,
+      "num_quantizers": 8,
+      "output_dim": 3072,
+      "quantizer_dropout": 0.0,
+      "rvq_dim": 512
+    },
+    "semantic_encoder_adapter_kwargs": {
+      "d_model": 768,
+      "encoder_attention_heads": 12,
+      "encoder_ffn_dim": 3072,
+      "encoder_layers": 4,
+      "input_dim": 768,
+      "max_source_positions": 1500,
+      "output_dim": 768
+    },
+    "semantic_encoder_kwargs": {
+      "activation_function": "gelu",
+      "d_model": 768,
+      "encoder_attention_heads": 12,
+      "encoder_ffn_dim": 3072,
+      "encoder_layers": 12,
+      "hop_length": 160,
+      "kernel_size": 3,
+      "max_audio_seconds": 30,
+      "num_mel_bins": 80,
+      "sampling_rate": 16000,
+      "scale_embedding": false,
+      "stride_size": 2
+    },
+    "upsample_kwargs": {
+      "d_model": 768,
+      "stride": 4
+    },
+    "vocos_kwargs": {
+      "dim": 512,
+      "hop_size": 320,
+      "input_channels": 80,
+      "intermediate_dim": 4096,
+      "n_fft": 1280,
+      "num_layers": 30,
+      "padding": "same"
+    }
+  },
+  "sampling_rate": 32000,
+  "transformers_version": "4.56.1",
+  "use_cache": true
+}

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5077d3f3445c4ccf1f4a20962c96e448a78c410db53503c87fb53314ae19d68d
+size 2137710040

preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+  "chunk_length": 30,
+  "feature_size": 80,
+  "hop_length": 160,
+  "n_fft": 400,
+  "n_samples": 480000,
+  "nb_max_frames": 3000,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "sampling_rate": 16000,
+  "return_attention_mask": true,
+  "return_tensors": "pt"
+}