Upload folder using huggingface_hub
Browse files- .gitattributes +1 -0
- README.md +145 -0
- chat_template.jinja +51 -0
- config.json +59 -0
- generation_config.json +7 -0
- model.safetensors +3 -0
- special_tokens_map.json +33 -0
- tokenizer.json +3 -0
- tokenizer_config.json +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
library_name: transformers
|
| 3 |
+
base_model:
|
| 4 |
+
- Zyphra/ZAYA1-reasoning-base
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
This tiny model is intended for debugging. It is randomly initialized using a configuration adapted from [Zyphra/ZAYA1-reasoning-base](https://huggingface.co/Zyphra/ZAYA1-reasoning-base).
|
| 8 |
+
|
| 9 |
+
### Example usage:
|
| 10 |
+
|
| 11 |
+
```python
|
| 12 |
+
from transformers import pipeline
|
| 13 |
+
model_id = "yujiepan/zaya1-tiny-random"
|
| 14 |
+
pipe = pipeline('text-generation', model=model_id,
|
| 15 |
+
device='cuda', dtype="bfloat16")
|
| 16 |
+
print(pipe('Hello World!'))
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
### Code to create this repo:
|
| 20 |
+
|
| 21 |
+
```python
|
| 22 |
+
import json
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
import accelerate
|
| 26 |
+
import torch
|
| 27 |
+
from huggingface_hub import file_exists, hf_hub_download
|
| 28 |
+
from transformers import (
|
| 29 |
+
AutoConfig,
|
| 30 |
+
AutoModelForCausalLM,
|
| 31 |
+
AutoProcessor,
|
| 32 |
+
AutoTokenizer,
|
| 33 |
+
GenerationConfig,
|
| 34 |
+
set_seed,
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
source_model_id = "Zyphra/ZAYA1-reasoning-base"
|
| 38 |
+
save_folder = "/tmp/yujiepan/zaya1-tiny-random"
|
| 39 |
+
|
| 40 |
+
processor = AutoTokenizer.from_pretrained(
|
| 41 |
+
source_model_id, trust_remote_code=True)
|
| 42 |
+
processor.save_pretrained(save_folder)
|
| 43 |
+
|
| 44 |
+
with open(hf_hub_download(source_model_id, filename='config.json', repo_type='model'), 'r', encoding='utf-8') as f:
|
| 45 |
+
config_json = json.load(f)
|
| 46 |
+
config_json['hidden_size'] = 512
|
| 47 |
+
config_json['num_attention_heads'] = 4
|
| 48 |
+
config_json['num_key_value_heads'] = 1
|
| 49 |
+
config_json['num_hidden_layers'] = 2
|
| 50 |
+
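# Workaround for a bug: build the model with tie_word_embeddings=False first, then set it back to True in the saved config.json at the end of this script.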
|
| 51 |
+
config_json['tie_word_embeddings'] = False
|
| 52 |
+
config_json['cca_num_q_heads'] = [2, 0]
|
| 53 |
+
config_json['ffn_hidden_size_list'] = [0, 32]
|
| 54 |
+
config_json['num_query_groups_list'] = [1, 0]
|
| 55 |
+
config_json['zaya_layers'] = ['a', 16]
|
| 56 |
+
config_json['zaya_mlp_expansion'] = [0, 8]
|
| 57 |
+
|
| 58 |
+
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
|
| 59 |
+
json.dump(config_json, f, indent=2)
|
| 60 |
+
|
| 61 |
+
config = AutoConfig.from_pretrained(
|
| 62 |
+
save_folder,
|
| 63 |
+
trust_remote_code=True,
|
| 64 |
+
)
|
| 65 |
+
print(config)
|
| 66 |
+
torch.set_default_dtype(torch.bfloat16)
|
| 67 |
+
model = AutoModelForCausalLM.from_config(config)
|
| 68 |
+
model.lm_head = None
|
| 69 |
+
torch.set_default_dtype(torch.float32)
|
| 70 |
+
if file_exists(filename="generation_config.json", repo_id=source_model_id, repo_type='model'):
|
| 71 |
+
model.generation_config = GenerationConfig.from_pretrained(
|
| 72 |
+
source_model_id, trust_remote_code=True,
|
| 73 |
+
)
|
| 74 |
+
set_seed(42)
|
| 75 |
+
model = model.cpu()
|
| 76 |
+
with torch.no_grad():
|
| 77 |
+
for name, p in sorted(model.named_parameters()):
|
| 78 |
+
torch.nn.init.normal_(p, 0, 0.1)
|
| 79 |
+
print(name, p.shape)
|
| 80 |
+
model.save_pretrained(save_folder)
|
| 81 |
+
with open(f"{save_folder}/config.json", 'r', encoding='utf-8') as f:
|
| 82 |
+
config_json = json.load(f)
|
| 83 |
+
config_json['tie_word_embeddings'] = True
|
| 84 |
+
with open(f"{save_folder}/config.json", "w", encoding='utf-8') as f:
|
| 85 |
+
json.dump(config_json, f, indent=2)
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
### Printing the model:
|
| 89 |
+
|
| 90 |
+
```text
|
| 91 |
+
ZayaForCausalLM(
|
| 92 |
+
(model): ZayaModel(
|
| 93 |
+
(embed_tokens): Embedding(262272, 512, padding_idx=0)
|
| 94 |
+
(layers): ModuleList(
|
| 95 |
+
(0): ZayaDecoderATTLayer(
|
| 96 |
+
(self_attn): ZayaSdpaAttention(
|
| 97 |
+
(o_proj): Linear(in_features=256, out_features=512, bias=False)
|
| 98 |
+
(qkv): CCA(
|
| 99 |
+
(linear_q): Linear(in_features=512, out_features=256, bias=False)
|
| 100 |
+
(linear_k): Linear(in_features=512, out_features=128, bias=False)
|
| 101 |
+
(val_proj1): Linear(in_features=512, out_features=64, bias=False)
|
| 102 |
+
(val_proj2): Linear(in_features=512, out_features=64, bias=False)
|
| 103 |
+
(conv_qk): Sequential(
|
| 104 |
+
(0): Conv1d(384, 384, kernel_size=(2,), stride=(1,), groups=384)
|
| 105 |
+
(1): Conv1d(384, 384, kernel_size=(2,), stride=(1,), groups=3)
|
| 106 |
+
)
|
| 107 |
+
)
|
| 108 |
+
)
|
| 109 |
+
(input_norm): ZayaRMSNorm((512,), eps=1e-05)
|
| 110 |
+
(res_scale): ResidualScaling()
|
| 111 |
+
)
|
| 112 |
+
(1): ZayaDecoderMLPLayer(
|
| 113 |
+
(zaya_block): ZayaBlock(
|
| 114 |
+
(router): ZayaRouter(
|
| 115 |
+
(down_proj): Linear(in_features=512, out_features=8, bias=True)
|
| 116 |
+
(rmsnorm_eda): ZayaRMSNorm((8,), eps=1e-06)
|
| 117 |
+
(non_linearity): GELU(approximate='none')
|
| 118 |
+
(router_mlp): Sequential(
|
| 119 |
+
(0): Linear(in_features=8, out_features=8, bias=True)
|
| 120 |
+
(1): GELU(approximate='none')
|
| 121 |
+
(2): Linear(in_features=8, out_features=8, bias=True)
|
| 122 |
+
(3): GELU(approximate='none')
|
| 123 |
+
(4): Linear(in_features=8, out_features=17, bias=False)
|
| 124 |
+
)
|
| 125 |
+
)
|
| 126 |
+
(experts): SequentialMLP(
|
| 127 |
+
(local_experts): ModuleList(
|
| 128 |
+
(0-15): 16 x MLP(
|
| 129 |
+
(linear_fc1): Linear(in_features=512, out_features=32, bias=False)
|
| 130 |
+
(linear_fc2): Linear(in_features=16, out_features=512, bias=False)
|
| 131 |
+
)
|
| 132 |
+
)
|
| 133 |
+
)
|
| 134 |
+
)
|
| 135 |
+
(input_norm): ZayaRMSNorm((512,), eps=1e-05)
|
| 136 |
+
(res_scale): ResidualScaling()
|
| 137 |
+
)
|
| 138 |
+
)
|
| 139 |
+
(res_scale): ResidualScaling()
|
| 140 |
+
(final_norm): ZayaRMSNorm((512,), eps=1e-05)
|
| 141 |
+
(rotary_emb): ZayaRotaryEmbedding()
|
| 142 |
+
)
|
| 143 |
+
(lm_head): None
|
| 144 |
+
)
|
| 145 |
+
```
|
chat_template.jinja
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if messages[0].role == "system" -%}
|
| 2 |
+
{%- set system_message = messages[0].content -%}
|
| 3 |
+
{%- set custom_instructions = system_message.replace("/no_think", "").replace("/think", "").rstrip() -%}
|
| 4 |
+
{%- endif -%}
|
| 5 |
+
|
| 6 |
+
{%- if custom_instructions -%}
|
| 7 |
+
{{- custom_instructions + "\n\n" -}}
|
| 8 |
+
{%- endif -%}
|
| 9 |
+
|
| 10 |
+
{%- if xml_tools or python_tools or tools -%}
|
| 11 |
+
{{- "### Tools\n\n" -}}
|
| 12 |
+
{%- if xml_tools or tools -%}
|
| 13 |
+
{%- if tools -%}
|
| 14 |
+
{%- set xml_tools = tools -%}
|
| 15 |
+
{%- endif -%}
|
| 16 |
+
{%- set ns = namespace(xml_tool_string="You may call one or more functions to assist with the user query.\nYou are provided with function signatures within <tools></tools> XML tags:\n\n<tools>\n") -%}
|
| 17 |
+
{%- for tool in xml_tools[:] -%} {# The slicing makes sure that xml_tools is a list #}
|
| 18 |
+
{%- set ns.xml_tool_string = ns.xml_tool_string ~ (tool | string) ~ "\n" -%}
|
| 19 |
+
{%- endfor -%}
|
| 20 |
+
{%- set xml_tool_string = ns.xml_tool_string + "</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call>" -%}
|
| 21 |
+
{{- xml_tool_string -}}
|
| 22 |
+
{%- endif -%}
|
| 23 |
+
{%- if python_tools -%}
|
| 24 |
+
{%- set ns = namespace(python_tool_string="When you send a message containing Python code between '<code>' and '</code>' tags, it will be executed in a stateful Jupyter notebook environment, and you will then be given the output to continued reasoning in an agentic loop.\n\nYou can use the following tools in your python code like regular functions:\n<tools>\n") -%}
|
| 25 |
+
{%- for tool in python_tools[:] -%} {# The slicing makes sure that python_tools is a list #}
|
| 26 |
+
{%- set ns.python_tool_string = ns.python_tool_string ~ (tool | string) ~ "\n" -%}
|
| 27 |
+
{%- endfor -%}
|
| 28 |
+
{%- set python_tool_string = ns.python_tool_string + "</tools>\n\nThe state persists between code executions: so variables that you define in one step are still available thereafter." -%}
|
| 29 |
+
{{- python_tool_string -}}
|
| 30 |
+
{%- endif -%}
|
| 31 |
+
{{- "\n\n" -}}
|
| 32 |
+
{%- endif -%}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
{# ───── main loop ───── #}
|
| 36 |
+
{%- for message in messages -%}
|
| 37 |
+
{%- set content = message.content if message.content is string else "" -%}
|
| 38 |
+
{%- if message.role == "user" -%}
|
| 39 |
+
{{- message.role + ": " + content + "\n" -}}
|
| 40 |
+
{%- elif message.role == "assistant" -%}
|
| 41 |
+
{% generation %}
|
| 42 |
+
{{- "assistant: " + content.lstrip("\n") + "\n" -}}
|
| 43 |
+
{% endgeneration %}
|
| 44 |
+
{%- elif message.role == "tool" -%}
|
| 45 |
+
{{- "tool: " + content + "\n" -}}
|
| 46 |
+
{%- endif -%}
|
| 47 |
+
{%- endfor -%}
|
| 48 |
+
{# ───── generation prompt ───── #}
|
| 49 |
+
{%- if add_generation_prompt -%}
|
| 50 |
+
{{- "assistant: " -}}
|
| 51 |
+
{%- endif -%}
|
config.json
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"activation_func": "swiglu",
|
| 3 |
+
"activation_func_fp8_input_store": false,
|
| 4 |
+
"add_bias_linear": false,
|
| 5 |
+
"architectures": [
|
| 6 |
+
"ZayaForCausalLM"
|
| 7 |
+
],
|
| 8 |
+
"attention_bias": false,
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"bias_activation_fusion": true,
|
| 11 |
+
"bos_token_id": 2,
|
| 12 |
+
"cca": true,
|
| 13 |
+
"cca_num_q_heads": [
|
| 14 |
+
2,
|
| 15 |
+
0
|
| 16 |
+
],
|
| 17 |
+
"dtype": "bfloat16",
|
| 18 |
+
"eos_token_id": 1,
|
| 19 |
+
"ffn_hidden_size_list": [
|
| 20 |
+
0,
|
| 21 |
+
32
|
| 22 |
+
],
|
| 23 |
+
"gated_linear_unit": true,
|
| 24 |
+
"hidden_size": 512,
|
| 25 |
+
"kv_channels": 128,
|
| 26 |
+
"lm_head_bias": false,
|
| 27 |
+
"max_position_embeddings": 32768,
|
| 28 |
+
"model_type": "zaya",
|
| 29 |
+
"moe_router_topk": 1,
|
| 30 |
+
"norm_epsilon": 1e-05,
|
| 31 |
+
"num_attention_heads": 4,
|
| 32 |
+
"num_hidden_layers": 2,
|
| 33 |
+
"num_key_value_heads": 1,
|
| 34 |
+
"num_query_groups_list": [
|
| 35 |
+
1,
|
| 36 |
+
0
|
| 37 |
+
],
|
| 38 |
+
"pad_token_id": 0,
|
| 39 |
+
"partial_rotary_factor": 0.5,
|
| 40 |
+
"residual_in_fp32": false,
|
| 41 |
+
"rope_scaling": false,
|
| 42 |
+
"rope_theta": 1000000,
|
| 43 |
+
"scale_residual_merge": true,
|
| 44 |
+
"sliding_window": null,
|
| 45 |
+
"tie_word_embeddings": true,
|
| 46 |
+
"transformers_version": "4.57.1",
|
| 47 |
+
"use_cache": true,
|
| 48 |
+
"vocab_size": 262272,
|
| 49 |
+
"zaya_layers": [
|
| 50 |
+
"a",
|
| 51 |
+
16
|
| 52 |
+
],
|
| 53 |
+
"zaya_mlp_expansion": [
|
| 54 |
+
0,
|
| 55 |
+
8
|
| 56 |
+
],
|
| 57 |
+
"zaya_use_eda": true,
|
| 58 |
+
"zaya_use_mod": true
|
| 59 |
+
}
|
generation_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_from_model_config": true,
|
| 3 |
+
"bos_token_id": 2,
|
| 4 |
+
"eos_token_id": 1,
|
| 5 |
+
"pad_token_id": 0,
|
| 6 |
+
"transformers_version": "4.57.1"
|
| 7 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:52b9d7ee31d844fc78c9b4612b7eeb0400389ef690c7a5c1c6b97f3406a4436b
|
| 3 |
+
size 270369518
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"boi_token": "<start_of_image>",
|
| 3 |
+
"bos_token": {
|
| 4 |
+
"content": "<bos>",
|
| 5 |
+
"lstrip": false,
|
| 6 |
+
"normalized": false,
|
| 7 |
+
"rstrip": false,
|
| 8 |
+
"single_word": false
|
| 9 |
+
},
|
| 10 |
+
"eoi_token": "<end_of_image>",
|
| 11 |
+
"eos_token": {
|
| 12 |
+
"content": "<eos>",
|
| 13 |
+
"lstrip": false,
|
| 14 |
+
"normalized": false,
|
| 15 |
+
"rstrip": false,
|
| 16 |
+
"single_word": false
|
| 17 |
+
},
|
| 18 |
+
"image_token": "<image_soft_token>",
|
| 19 |
+
"pad_token": {
|
| 20 |
+
"content": "<pad>",
|
| 21 |
+
"lstrip": false,
|
| 22 |
+
"normalized": false,
|
| 23 |
+
"rstrip": false,
|
| 24 |
+
"single_word": false
|
| 25 |
+
},
|
| 26 |
+
"unk_token": {
|
| 27 |
+
"content": "<unk>",
|
| 28 |
+
"lstrip": false,
|
| 29 |
+
"normalized": false,
|
| 30 |
+
"rstrip": false,
|
| 31 |
+
"single_word": false
|
| 32 |
+
}
|
| 33 |
+
}
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7b244434a1e668213b5494c816f8077d7b5b64bdac094ea09e7aaf6281b77f00
|
| 3 |
+
size 33384937
|
tokenizer_config.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|