{ "_comment": "DeepSeek-Nano: Arquitetura DeepSeek-V3 escalonada para ~100k parâmetros ativos/token", "model_type": "deepseek_nano", "architectures": ["DeepSeekNanoForCausalLM"], "auto_map": { "AutoConfig": "modeling_axion.AxionConfig", "AutoModelForCausalLM": "modeling_axion.DeepSeekNanoForCausalLM" }, "vocab_size": 1024, "d_model": 64, "n_layers": 4, "n_heads": 4, "d_head": 16, "kv_lora_rank": 8, "q_lora_rank": 16, "rope_theta": 10000.0, "rope_scaling": null, "n_shared_experts": 1, "n_routed_experts": 4, "n_active_experts": 2, "d_ff": 64, "moe_aux_loss_coef": 0.0, "expert_bias_init": 0.0, "max_seq_len": 512, "dropout": 0.0, "norm_eps": 1e-6, "tie_embeddings": true, "pad_token_id": 0, "bos_token_id": 1, "eos_token_id": 2, "unk_token_id": 3, "_param_estimate": { "embedding": 65536, "per_layer_approx": 69632, "total_approx": 344064, "active_per_token": "~160k (MLA + 1 shared + 2 routed experts + norms)" } }