CloudRipple committed on
Commit
9f16a12
·
verified ·
1 Parent(s): 5b6fd03

Init repository with huggingface version of MOSS_TTSD_tokenizer

Browse files
Files changed (3) hide show
  1. config.json +124 -0
  2. model.safetensors +3 -0
  3. preprocessor_config.json +13 -0
config.json ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "code_dim": 3072,
3
+ "decoder_upsample_rate": 2560,
4
+ "dtype": "float32",
5
+ "encoder_downsample_rate": 1280,
6
+ "initializer_range": 0.02,
7
+ "input_sample_rate": 16000,
8
+ "input_sampling_rate": 16000,
9
+ "model_type": "xy_tokenizer",
10
+ "output_sample_rate": 32000,
11
+ "params": {
12
+ "acoustic_decoder_kwargs": {
13
+ "activation_function": "gelu",
14
+ "d_model": 768,
15
+ "decoder_attention_heads": 12,
16
+ "decoder_ffn_dim": 3072,
17
+ "decoder_layers": 12,
18
+ "hop_length": 160,
19
+ "kernel_size": 3,
20
+ "max_audio_seconds": 30,
21
+ "num_mel_bins": 80,
22
+ "sampling_rate": 16000,
23
+ "scale_embedding": false,
24
+ "stride_size": 2
25
+ },
26
+ "acoustic_encoder_kwargs": {
27
+ "activation_function": "gelu",
28
+ "d_model": 768,
29
+ "encoder_attention_heads": 12,
30
+ "encoder_ffn_dim": 3072,
31
+ "encoder_layers": 12,
32
+ "hop_length": 160,
33
+ "kernel_size": 3,
34
+ "max_audio_seconds": 30,
35
+ "num_mel_bins": 80,
36
+ "sampling_rate": 16000,
37
+ "scale_embedding": false,
38
+ "stride_size": 2
39
+ },
40
+ "downsample_kwargs": {
41
+ "avg_pooler": 4,
42
+ "d_model": 768
43
+ },
44
+ "feature_extractor_kwargs": {
45
+ "chunk_length": 30,
46
+ "feature_size": 80,
47
+ "hop_length": 160,
48
+ "n_fft": 400,
49
+ "n_samples": 480000,
50
+ "nb_max_frames": 3000,
51
+ "padding_side": "right",
52
+ "padding_value": 0.0,
53
+ "return_attention_mask": true,
54
+ "return_tensors": "pt",
55
+ "sampling_rate": 16000
56
+ },
57
+ "post_rvq_adapter_kwargs": {
58
+ "d_model": 768,
59
+ "encoder_attention_heads": 12,
60
+ "encoder_ffn_dim": 3072,
61
+ "encoder_layers": 4,
62
+ "input_dim": 3072,
63
+ "max_source_positions": 375,
64
+ "output_dim": 3072
65
+ },
66
+ "pre_rvq_adapter_kwargs": {
67
+ "d_model": 768,
68
+ "encoder_attention_heads": 12,
69
+ "encoder_ffn_dim": 3072,
70
+ "encoder_layers": 4,
71
+ "input_dim": 1536,
72
+ "max_source_positions": 1500,
73
+ "output_dim": 768
74
+ },
75
+ "quantizer_kwargs": {
76
+ "codebook_dim": 512,
77
+ "codebook_size": 1024,
78
+ "input_dim": 3072,
79
+ "num_quantizers": 8,
80
+ "output_dim": 3072,
81
+ "quantizer_dropout": 0.0,
82
+ "rvq_dim": 512
83
+ },
84
+ "semantic_encoder_adapter_kwargs": {
85
+ "d_model": 768,
86
+ "encoder_attention_heads": 12,
87
+ "encoder_ffn_dim": 3072,
88
+ "encoder_layers": 4,
89
+ "input_dim": 768,
90
+ "max_source_positions": 1500,
91
+ "output_dim": 768
92
+ },
93
+ "semantic_encoder_kwargs": {
94
+ "activation_function": "gelu",
95
+ "d_model": 768,
96
+ "encoder_attention_heads": 12,
97
+ "encoder_ffn_dim": 3072,
98
+ "encoder_layers": 12,
99
+ "hop_length": 160,
100
+ "kernel_size": 3,
101
+ "max_audio_seconds": 30,
102
+ "num_mel_bins": 80,
103
+ "sampling_rate": 16000,
104
+ "scale_embedding": false,
105
+ "stride_size": 2
106
+ },
107
+ "upsample_kwargs": {
108
+ "d_model": 768,
109
+ "stride": 4
110
+ },
111
+ "vocos_kwargs": {
112
+ "dim": 512,
113
+ "hop_size": 320,
114
+ "input_channels": 80,
115
+ "intermediate_dim": 4096,
116
+ "n_fft": 1280,
117
+ "num_layers": 30,
118
+ "padding": "same"
119
+ }
120
+ },
121
+ "sampling_rate": 32000,
122
+ "transformers_version": "4.56.1",
123
+ "use_cache": true
124
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5077d3f3445c4ccf1f4a20962c96e448a78c410db53503c87fb53314ae19d68d
3
+ size 2137710040
preprocessor_config.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "chunk_length": 30,
3
+ "feature_size": 80,
4
+ "hop_length": 160,
5
+ "n_fft": 400,
6
+ "n_samples": 480000,
7
+ "nb_max_frames": 3000,
8
+ "padding_side": "right",
9
+ "padding_value": 0.0,
10
+ "sampling_rate": 16000,
11
+ "return_attention_mask": true,
12
+ "return_tensors": "pt"
13
+ }