yujiepan committed on
Commit 4561610 · verified · 1 Parent(s): f55348d

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,145 @@
+ ---
+ library_name: transformers
+ base_model:
+ - Zyphra/ZAYA1-reasoning-base
+ ---
+
+ This tiny model is intended for debugging. It is randomly initialized using the configuration adapted from [Zyphra/ZAYA1-reasoning-base](https://huggingface.co/Zyphra/ZAYA1-reasoning-base).
+
+ ### Example usage:
+
+ ```python
+ from transformers import pipeline
+
+ model_id = "yujiepan/zaya1-tiny-random"
+ pipe = pipeline('text-generation', model=model_id, device='cuda',
+                 dtype="bfloat16", trust_remote_code=True)
+ print(pipe('Hello World!'))
+ ```
+
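+ Alternatively, a minimal sketch loading the model directly (assuming, as in the creation script below, that the custom `zaya` architecture requires `trust_remote_code=True`):
+
+ ```python
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ model_id = "yujiepan/zaya1-tiny-random"
+ tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(
+     model_id, dtype=torch.bfloat16, trust_remote_code=True)
+
+ inputs = tokenizer("Hello World!", return_tensors="pt")
+ # The weights are tiny and random: the output is gibberish,
+ # but this exercises the full forward pass and generation loop.
+ outputs = model.generate(**inputs, max_new_tokens=8)
+ print(tokenizer.decode(outputs[0]))
+ ```
+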
+ ### Code to create this repo:
+
+ ```python
+ import json
+
+ import torch
+ from huggingface_hub import file_exists, hf_hub_download
+ from transformers import (
+     AutoConfig,
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     GenerationConfig,
+     set_seed,
+ )
+
+ source_model_id = "Zyphra/ZAYA1-reasoning-base"
+ save_folder = "/tmp/yujiepan/zaya1-tiny-random"
+
+ tokenizer = AutoTokenizer.from_pretrained(
+     source_model_id, trust_remote_code=True)
+ tokenizer.save_pretrained(save_folder)
+
+ # Shrink the source config down to a tiny footprint.
+ with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
+     config_json = json.load(f)
+ config_json['hidden_size'] = 512
+ config_json['num_attention_heads'] = 4
+ config_json['num_key_value_heads'] = 1
+ config_json['num_hidden_layers'] = 2
+ # Workaround for a bug: weight tying must be disabled while the model is
+ # created, then re-enabled in the saved config.json at the end of this script.
+ config_json['tie_word_embeddings'] = False
+ config_json['cca_num_q_heads'] = [2, 0]
+ config_json['ffn_hidden_size_list'] = [0, 32]
+ config_json['num_query_groups_list'] = [1, 0]
+ config_json['zaya_layers'] = ['a', 16]
+ config_json['zaya_mlp_expansion'] = [0, 8]
+
+ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
+     json.dump(config_json, f, indent=2)
+
+ config = AutoConfig.from_pretrained(
+     save_folder,
+     trust_remote_code=True,
+ )
+ print(config)
+ torch.set_default_dtype(torch.bfloat16)
+ model = AutoModelForCausalLM.from_config(config)
+ # Drop the separate lm_head so only the (to-be-tied) embedding is saved.
+ model.lm_head = None
+ torch.set_default_dtype(torch.float32)
+ if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
+     model.generation_config = GenerationConfig.from_pretrained(
+         source_model_id, trust_remote_code=True,
+     )
+ set_seed(42)
+ model = model.cpu()
+ with torch.no_grad():
+     for name, p in sorted(model.named_parameters()):
+         torch.nn.init.normal_(p, 0, 0.1)
+         print(name, p.shape)
+ model.save_pretrained(save_folder)
+ # Second half of the workaround above: restore weight tying in the saved config.
+ with open(f"{save_folder}/config.json", 'r', encoding='utf-8') as f:
+     config_json = json.load(f)
+ config_json['tie_word_embeddings'] = True
+ with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
+     json.dump(config_json, f, indent=2)
+ ```
+
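+ Continuing from the script above, a quick sanity check (not part of the original script) that the tying workaround took effect; this assumes the remote code implements the standard `get_input_embeddings`/`get_output_embeddings` API:
+
+ ```python
+ reloaded = AutoModelForCausalLM.from_pretrained(
+     save_folder, dtype=torch.bfloat16, trust_remote_code=True)
+ # With tie_word_embeddings restored to True, the reloaded input and
+ # output embeddings should share the same underlying storage.
+ assert reloaded.get_output_embeddings().weight.data_ptr() == \
+     reloaded.get_input_embeddings().weight.data_ptr()
+ ```
+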
+ ### Printing the model:
+
+ ```text
+ ZayaForCausalLM(
+   (model): ZayaModel(
+     (embed_tokens): Embedding(262272, 512, padding_idx=0)
+     (layers): ModuleList(
+       (0): ZayaDecoderATTLayer(
+         (self_attn): ZayaSdpaAttention(
+           (o_proj): Linear(in_features=256, out_features=512, bias=False)
+           (qkv): CCA(
+             (linear_q): Linear(in_features=512, out_features=256, bias=False)
+             (linear_k): Linear(in_features=512, out_features=128, bias=False)
+             (val_proj1): Linear(in_features=512, out_features=64, bias=False)
+             (val_proj2): Linear(in_features=512, out_features=64, bias=False)
+             (conv_qk): Sequential(
+               (0): Conv1d(384, 384, kernel_size=(2,), stride=(1,), groups=384)
+               (1): Conv1d(384, 384, kernel_size=(2,), stride=(1,), groups=3)
+             )
+           )
+         )
+         (input_norm): ZayaRMSNorm((512,), eps=1e-05)
+         (res_scale): ResidualScaling()
+       )
+       (1): ZayaDecoderMLPLayer(
+         (zaya_block): ZayaBlock(
+           (router): ZayaRouter(
+             (down_proj): Linear(in_features=512, out_features=8, bias=True)
+             (rmsnorm_eda): ZayaRMSNorm((8,), eps=1e-06)
+             (non_linearity): GELU(approximate='none')
+             (router_mlp): Sequential(
+               (0): Linear(in_features=8, out_features=8, bias=True)
+               (1): GELU(approximate='none')
+               (2): Linear(in_features=8, out_features=8, bias=True)
+               (3): GELU(approximate='none')
+               (4): Linear(in_features=8, out_features=17, bias=False)
+             )
+           )
+           (experts): SequentialMLP(
+             (local_experts): ModuleList(
+               (0-15): 16 x MLP(
+                 (linear_fc1): Linear(in_features=512, out_features=32, bias=False)
+                 (linear_fc2): Linear(in_features=16, out_features=512, bias=False)
+               )
+             )
+           )
+         )
+         (input_norm): ZayaRMSNorm((512,), eps=1e-05)
+         (res_scale): ResidualScaling()
+       )
+     )
+     (res_scale): ResidualScaling()
+     (final_norm): ZayaRMSNorm((512,), eps=1e-05)
+     (rotary_emb): ZayaRotaryEmbedding()
+   )
+   (lm_head): None
+ )
+ ```
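+
+ The per-layer list fields set in the creation script appear to index the two printed layers (an assumption read off this printout, not confirmed against the ZAYA1 modeling code): index 0 configures the attention layer and index 1 the MoE MLP layer. A small sketch making that pairing explicit:
+
+ ```python
+ import json
+ from huggingface_hub import hf_hub_download
+
+ with open(hf_hub_download("yujiepan/zaya1-tiny-random", "config.json"), encoding="utf-8") as f:
+     cfg = json.load(f)
+
+ # zaya_layers == ['a', 16]: 'a' -> attention layer, 16 -> MLP layer with 16 experts.
+ for i, kind in enumerate(cfg["zaya_layers"]):
+     if kind == "a":
+         print(f"layer {i}: attention, cca_num_q_heads={cfg['cca_num_q_heads'][i]}, "
+               f"num_query_groups={cfg['num_query_groups_list'][i]}")
+     else:
+         print(f"layer {i}: MoE MLP, {kind} experts, "
+               f"ffn_hidden_size={cfg['ffn_hidden_size_list'][i]}")
+ ```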
chat_template.jinja ADDED
@@ -0,0 +1,51 @@
+ {%- if messages[0].role == "system" -%}
+   {%- set system_message = messages[0].content -%}
+   {%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
+ {%- endif -%}
+
+ {%- if custom_instructions -%}
+   {{- custom_instructions + "\n\n" -}}
+ {%- endif -%}
+
+ {%- if xml_tools or python_tools or tools -%}
+   {{- "### Tools\n\n" -}}
+   {%- if xml_tools or tools -%}
+     {%- if tools -%}
+       {%- set xml_tools = tools -%}
+     {%- endif -%}
+     {%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
+     {%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
+       {%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
+     {%- endfor -%}
+     {%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
+     {{- xml_tool_string -}}
+   {%- endif -%}
+   {%- if python_tools -%}
+     {%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continue reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%}
+     {%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
+       {%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
+     {%- endfor -%}
+     {%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
+     {{- python_tool_string -}}
+   {%- endif -%}
+   {{- "\n\n" -}}
+ {%- endif -%}
+
+ {# ───── main loop ───── #}
+ {%- for message in messages -%}
+   {%- set content = message.content if message.content is string else "" -%}
+   {%- if message.role == "user" -%}
+     {{- message.role + ": " + content + "\n" -}}
+   {%- elif message.role == "assistant" -%}
+     {% generation %}
+     {{- "assistant: " + content.lstrip("\n") + "\n" -}}
+     {% endgeneration %}
+   {%- elif message.role == "tool" -%}
+     {{- "tool: " + content + "\n" -}}
+   {%- endif -%}
+ {%- endfor -%}
+ {# ───── generation prompt ───── #}
+ {%- if add_generation_prompt -%}
+   {{- "assistant: " -}}
+ {%- endif -%}
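A minimal sketch of how this template renders (assuming the tokenizer shipped in this repo picks it up via the standard `apply_chat_template` API):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "yujiepan/zaya1-tiny-random", trust_remote_code=True)
messages = [
    {"role": "system", "content": "Be brief."},
    {"role": "user", "content": "Hi!"},
]
# Expected shape of the output, per the template above:
# "Be brief.\n\nuser: Hi!\nassistant: "
print(tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True))
```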
config.json ADDED
@@ -0,0 +1,59 @@
+ {
+   "activation_func": "swiglu",
+   "activation_func_fp8_input_store": false,
+   "add_bias_linear": false,
+   "architectures": [
+     "ZayaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bias_activation_fusion": true,
+   "bos_token_id": 2,
+   "cca": true,
+   "cca_num_q_heads": [
+     2,
+     0
+   ],
+   "dtype": "bfloat16",
+   "eos_token_id": 1,
+   "ffn_hidden_size_list": [
+     0,
+     32
+   ],
+   "gated_linear_unit": true,
+   "hidden_size": 512,
+   "kv_channels": 128,
+   "lm_head_bias": false,
+   "max_position_embeddings": 32768,
+   "model_type": "zaya",
+   "moe_router_topk": 1,
+   "norm_epsilon": 1e-05,
+   "num_attention_heads": 4,
+   "num_hidden_layers": 2,
+   "num_key_value_heads": 1,
+   "num_query_groups_list": [
+     1,
+     0
+   ],
+   "pad_token_id": 0,
+   "partial_rotary_factor": 0.5,
+   "residual_in_fp32": false,
+   "rope_scaling": false,
+   "rope_theta": 1000000,
+   "scale_residual_merge": true,
+   "sliding_window": null,
+   "tie_word_embeddings": true,
+   "transformers_version": "4.57.1",
+   "use_cache": true,
+   "vocab_size": 262272,
+   "zaya_layers": [
+     "a",
+     16
+   ],
+   "zaya_mlp_expansion": [
+     0,
+     8
+   ],
+   "zaya_use_eda": true,
+   "zaya_use_mod": true
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 2,
+   "eos_token_id": 1,
+   "pad_token_id": 0,
+   "transformers_version": "4.57.1"
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:52b9d7ee31d844fc78c9b4612b7eeb0400389ef690c7a5c1c6b97f3406a4436b
+ size 270369518
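The checkpoint size is consistent with the config: nearly all of the roughly 135M bf16 parameters sit in the tied 262272 × 512 embedding. A back-of-the-envelope sketch counting only the dominant tensors (shapes taken from the printed module tree above):

```python
embed = 262272 * 512                  # tied embedding: ~134.3M params
experts = 16 * (512 * 32 + 16 * 512)  # 16 MoE experts (fc1 + fc2): ~0.39M params
attn = 256 * 512 + 512 * 256 + 512 * 128 + 2 * 512 * 64  # o/q/k/v projections
# ~270.1 MB at 2 bytes/param, close to the 270,369,518-byte file;
# conv, router, and norm weights account for the remainder.
print((embed + experts + attn) * 2 / 1e6)
```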
special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "boi_token": "<start_of_image>",
+   "bos_token": {
+     "content": "<bos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eoi_token": "<end_of_image>",
+   "eos_token": {
+     "content": "<eos>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "image_token": "<image_soft_token>",
+   "pad_token": {
+     "content": "<pad>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7b244434a1e668213b5494c816f8077d7b5b64bdac094ea09e7aaf6281b77f00
+ size 33384937
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff